diff --git a/.config/nextest.toml b/.config/nextest.toml index a9398e4ab0..affdc16f31 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = { period = "20s", terminate-after = 3 } +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.dockerignore b/.dockerignore index 29abdc37aa..1258532db8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -23,6 +23,7 @@ !s3_scrubber/ !safekeeper/ !storage_broker/ +!storage_controller/ !trace/ !vendor/postgres-*/ !workspace_hack/ diff --git a/.github/actionlint.yml b/.github/actionlint.yml index cb36e2eee6..942861ecd8 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,11 +1,9 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large - # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. - - macos-14 + - large-arm64 - small - us-east-2 config-variables: diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: 
console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2e56bf909f..1eaf05cd54 100644 --- 
a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,15 +147,16 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + { "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -171,7 +172,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + { "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -190,7 +191,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -253,6 +254,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neonvm-captest-sharding-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} + ;; neon-captest-new | neon-captest-freetier | 
neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; @@ -270,11 +274,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -401,11 +409,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -507,11 +519,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set @@ -597,11 +613,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run user examples uses: 
./.github/actions/run-python-test-set diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..bdf00bcaae 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -38,7 +39,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2bcda7cc8e..21e7a56670 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -236,27 +236,6 @@ jobs: submodules: true fetch-depth: 1 - - name: Check Postgres submodules revision - shell: bash -euo pipefail {0} - run: | - # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). 
- # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 - - FAILED=false - for postgres in postgres-v14 postgres-v15 postgres-v16; do - expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') - actual=$(git rev-parse "HEAD:vendor/${postgres}") - if [ "${expected}" != "${actual}" ]; then - echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" - FAILED=true - fi - done - - if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendor/revisions.json if these changes are intentional" - exit 1 - fi - - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT @@ -362,6 +341,9 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done @@ -477,6 +459,8 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 @@ -556,6 +540,9 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -735,7 +722,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> 
$GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -792,7 +779,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. @@ -865,7 +852,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.23.2 + VM_BUILDER_VERSION: v0.28.1 steps: - name: Checkout @@ -1121,18 +1108,34 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - - # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ + -f deployStorageController=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f 
deployPgSniRouter=true \ + -f deployProxy=true \ + -f deployStorage=false \ + -f deployStorageBroker=false \ + -f deployStorageController=false \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. 
+ COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5a2f9d6645..fdb03963fb 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] env: # Use release build only, to have less debug info around @@ -232,20 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | - cargo nextest run $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -255,12 +255,12 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, 
arm64 ] + runs-on: [ self-hosted, large-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -269,6 +269,11 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -305,31 +310,35 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: @@ -338,7 +347,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || 
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -369,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index ae34cbffe0..7111ee37fa 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -62,14 +62,14 @@ jobs: trigger-e2e-tests: needs: [ tag ] - runs-on: [ self-hosted, gen3, small ] + runs-on: ubuntu-latest env: TAG: ${{ needs.tag.outputs.build-tag }} - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init steps: - name: check if ecr image are present + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} run: | for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) @@ -79,41 +79,55 @@ jobs: fi done - - name: Set PR's status to pending and request a remote CI test + - name: Set e2e-platforms + id: e2e-platforms + env: 
+ PR_NUMBER: ${{ github.event.pull_request.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + # Default set of platforms to run e2e tests on + platforms='["docker", "k8s"]' - REMOTE_REPO="${{ github.repository_owner }}/cloud" + # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the workflow run is not a pull request, add k8s-neonvm to the list. + if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then + for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do + case "$f" in + vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node) + platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') + ;; + *) + # no-op + ;; + esac + done + else + platforms=$(echo "${platforms}" | jq --compact-output '. 
+= ["k8s-neonvm"] | unique') + fi - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" + echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\", - \"storage_image_tag\": \"${TAG}\", - \"compute_image_tag\": \"${TAG}\", - \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" - } - }" + - name: Set PR's status to pending and request a remote CI test + env: + E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" + + gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ + --method POST \ + --raw-field "state=pending" \ + --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \ + --raw-field "context=neon-cloud-e2e" + + gh workflow --repo ${REMOTE_REPO} \ + run testing.yml \ + --ref "main" \ + --raw-field "ci_job_name=neon-cloud-e2e" \ + --raw-field "commit_hash=$COMMIT_SHA" \ + --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ + --raw-field "storage_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${TAG}" \ + --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ + --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/CODEOWNERS b/CODEOWNERS index 9a23e8c958..af2fa6088e 100644 --- a/CODEOWNERS 
+++ b/CODEOWNERS @@ -1,5 +1,5 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/attachment_service @neondatabase/storage +/storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage diff --git a/Cargo.lock b/Cargo.lock index 824cac13b3..6ce7180d67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.9" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -271,43 +271,10 @@ dependencies = [ ] [[package]] -name = "attachment_service" -version = "0.1.0" -dependencies = [ - "anyhow", - "aws-config", - "aws-sdk-secretsmanager", - "bytes", - "camino", - "clap", - "control_plane", - "diesel", - "diesel_migrations", - "fail", - "futures", - "git-version", - "hex", - "humantime", - "hyper", - "lasso", - "measured", - "metrics", - "once_cell", - "pageserver_api", - "pageserver_client", - "postgres_connection", - "r2d2", - "reqwest", - "routerify", - "serde", - "serde_json", - "thiserror", - "tokio", - "tokio-util", - "tracing", - "utils", - "workspace_hack", -] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" [[package]] name = "autocfg" @@ -317,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.1.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -337,19 +304,20 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -359,9 +327,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -374,7 +342,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -382,11 +350,35 @@ dependencies = [ ] [[package]] -name = "aws-sdk-s3" -version = "1.14.0" +name = "aws-sdk-iam" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" +checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = 
"aws-sdk-s3" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" +dependencies = [ + "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -401,43 +393,25 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", + "fastrand 2.0.0", + "hex", + "hmac", "http 0.2.9", - "http-body", + "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", "regex-lite", + "sha2", "tracing", "url", ] -[[package]] -name = "aws-sdk-secretsmanager" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand 2.0.0", - "http 0.2.9", - "once_cell", - "regex-lite", - "tracing", -] - [[package]] name = "aws-sdk-sso" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", "aws-runtime", @@ -457,9 +431,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", "aws-runtime", @@ -479,9 +453,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -502,9 +476,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.1.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -517,7 +491,7 @@ dependencies = [ "hex", "hmac", "http 0.2.9", - "http 1.0.0", + "http 1.1.0", "once_cell", "p256", "percent-encoding", @@ -531,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -542,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -553,7 +527,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -574,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.4" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d" +checksum = 
"4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -585,7 +559,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -595,18 +569,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", "urlencoding", @@ -614,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.4" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -624,29 +598,31 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "http-body 1.0.0", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.9", + "rustls 0.21.11", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "1.1.4" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", "http 0.2.9", + "http 1.1.0", "pin-project-lite", "tokio", "tracing", @@ -655,16 +631,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.4" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http 1.1.0", + "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -678,18 +657,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.4" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -713,10 +692,10 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -729,7 +708,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -745,7 +724,7 @@ dependencies = [ "bytes", 
"futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -754,9 +733,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -772,7 +751,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -784,9 +763,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -804,9 +783,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", "async-lock", @@ -823,9 +802,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -844,9 +823,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -897,6 +876,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1162,7 +1147,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1234,7 +1219,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1242,7 +1227,7 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", @@ -1351,7 +1336,8 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "humantime-serde", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1360,7 +1346,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -1373,6 +1359,7 @@ dependencies = [ "tokio-postgres", "tokio-util", "toml", + "toml_edit", "tracing", "url", "utils", @@ -1500,12 +1487,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" 
[[package]] name = "crossterm" @@ -1878,23 +1862,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2234,9 +2207,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.24" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -2251,6 +2224,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2332,6 +2324,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = 
"hermit-abi" version = "0.3.3" @@ -2383,6 +2381,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2396,9 +2405,9 @@ dependencies = [ [[package]] name = "http" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" dependencies = [ "bytes", "fnv", @@ -2416,6 +2425,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2474,9 +2506,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2488,6 +2520,27 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + 
"httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2495,21 +2548,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", - "rustls 0.21.9", - "rustls-native-certs", + "rustls 0.21.11", + "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", ] +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2522,7 +2592,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2530,15 +2600,37 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + 
+[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2552,7 +2644,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2723,9 +2815,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -2832,6 +2924,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2848,6 +2946,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.0", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2869,6 +2976,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -2886,11 +2999,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2903,16 +3017,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2952,8 +3077,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -2988,16 +3115,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3124,6 +3241,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3331,7 +3458,7 @@ dependencies = [ "bytes", "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3349,7 +3476,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3435,9 +3562,9 @@ dependencies = [ [[package]] name = "ordered-multimap" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", "hashbrown 0.14.0", @@ -3460,6 +3587,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -3503,12 +3636,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3544,7 +3682,7 @@ dependencies = [ "hex-literal", 
"humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -3563,11 +3701,11 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3581,6 +3719,7 @@ dependencies = [ "strum_macros", "svg_fmt", "sync_wrapper", + "sysinfo", "tenant_size_model", "thiserror", "tokio", @@ -3592,6 +3731,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", @@ -3635,7 +3775,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -3653,7 +3793,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", @@ -3994,7 +4133,7 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4123,6 +4262,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4135,7 +4297,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4156,7 +4318,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ 
"bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4198,7 +4360,13 @@ name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "async-trait", + "atomic-take", + "aws-config", + "aws-sdk-iam", + "aws-sigv4", + "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4209,20 +4377,27 @@ dependencies = [ "consumption_metrics", "dashmap", "env_logger", + "fallible-iterator", "futures", "git-version", "hashbrown 0.13.2", "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", + "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", + "indexmap 2.0.1", "ipnet", "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", @@ -4243,14 +4418,14 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", "routerify", "rstest", "rustc-hash", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4270,6 +4445,7 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4431,9 +4607,9 @@ dependencies = [ [[package]] name = "redis" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" dependencies = [ "async-trait", "bytes", @@ -4442,15 +4618,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-native-certs", - "rustls-pemfile 1.0.2", - "rustls-webpki 0.101.7", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "ryu", "sha1_smol", - "socket2 0.4.9", + "socket2 0.5.5", "tokio", - "tokio-rustls 0.24.0", + "tokio-rustls 
0.25.0", "tokio-util", "url", ] @@ -4551,7 +4727,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4560,6 +4736,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", "tokio-stream", @@ -4581,73 +4758,110 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-tls", "ipnet", "js-sys", "log", "mime", - "mime_guess", "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", - "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http 0.2.9", - "reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http 0.2.9", - "hyper", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4655,27 +4869,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -4729,7 +4943,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4842,10 +5056,23 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.21.9" +name = "rustix" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4855,9 +5082,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.6", @@ -4879,6 +5106,19 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-native-certs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.2" @@ -4958,6 +5198,7 @@ dependencies = [ "aws-smithy-async", "bincode", "bytes", + "camino", "chrono", "clap", "crc32c", @@ -4967,18 +5208,23 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", 
"remote_storage", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", @@ -5008,7 +5254,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5020,7 +5266,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -5150,13 +5396,13 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", - "rustls 0.21.9", + "reqwest 0.12.4", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5169,9 +5415,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -5181,11 +5427,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -5195,9 +5441,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" 
+version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -5208,9 +5454,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5218,9 +5464,9 @@ dependencies = [ [[package]] name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5230,13 +5476,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -5493,9 +5739,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5587,7 +5833,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", 
"once_cell", "parking_lot 0.12.1", @@ -5601,6 +5847,67 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + "hyper 0.14.26", + "itertools", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "reqwest 0.12.4", + "routerify", + "serde", + "serde_json", + "strum", + "strum_macros", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "hyper 0.14.26", + "pageserver_api", + "pageserver_client", + "reqwest 0.12.4", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -5629,7 +5936,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5645,8 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" [[package]] name = "syn" @@ -5675,6 +5981,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] 
[[package]] name = "synstructure" @@ -5757,23 +6066,23 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -5914,9 +6223,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", @@ -6008,7 +6317,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", "ring 0.17.6", - "rustls 0.22.2", + "rustls 0.22.4", "tokio", "tokio-postgres", "tokio-rustls 0.25.0", @@ -6021,7 +6330,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.11", "tokio", ] @@ -6031,7 +6340,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" dependencies = [ - "rustls 0.22.2", + "rustls 0.22.4", 
"rustls-pki-types", "tokio", ] @@ -6071,7 +6380,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6138,15 +6459,15 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", "prost", - "rustls-native-certs", + "rustls-native-certs 0.6.2", "rustls-pemfile 1.0.2", "tokio", "tokio-rustls 0.24.0", @@ -6281,12 +6602,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6310,6 +6633,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -6327,11 +6651,11 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6364,6 +6688,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6398,15 +6741,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -6461,7 +6795,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6528,7 +6862,8 @@ dependencies = [ "heapless", "hex", "hex-literal", - "hyper", + "humantime", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -6687,9 +7022,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6697,9 +7032,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -6712,9 +7047,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6724,9 +7059,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6734,9 +7069,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -6747,9 +7082,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6764,6 +7099,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -6781,9 +7129,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -6804,6 +7152,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -6855,6 +7212,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -6888,6 +7264,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6918,6 +7303,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 
0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6930,6 +7330,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6942,6 +7348,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6954,6 +7366,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6966,6 +7384,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6978,6 +7402,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6990,6 +7420,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7002,6 +7438,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7021,6 +7463,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -7031,7 +7483,6 @@ dependencies = [ "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime-api", "aws-smithy-types", 
"axum", "base64 0.21.1", @@ -7051,11 +7502,10 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7072,8 +7522,9 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", - "rustls 0.21.9", + "reqwest 0.11.19", + "reqwest 0.12.4", + "rustls 0.21.11", "scopeguard", "serde", "serde_json", @@ -7082,6 +7533,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", @@ -7093,7 +7545,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 44e6ec9744..17f30a1327 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "compute_tools", "control_plane", - "control_plane/attachment_service", + "control_plane/storcon_cli", "pageserver", "pageserver/compaction", "pageserver/ctl", @@ -12,6 +12,7 @@ members = [ "proxy", "safekeeper", "storage_broker", + "storage_controller", "s3_scrubber", "workspace_hack", "trace", @@ -43,19 +44,22 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +atomic-take = "1.1.0" +azure_core = "0.19" +azure_identity = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.14" -aws-sdk-secretsmanager = { version = "1.14.0" } -aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.1.4" -aws-credential-types = "1.1.4" +aws-config = { version = "1.3", default-features = 
false, features=["rustls"] } +aws-sdk-s3 = "1.26" +aws-sdk-iam = "1.15.0" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -76,6 +80,7 @@ either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" +fallible-iterator = "0.2" fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" @@ -88,11 +93,13 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" +indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" @@ -101,7 +108,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -121,12 +129,12 @@ procfs = "0.14" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" -redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } +redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", 
features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" +reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -136,7 +144,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -150,11 +158,12 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -svg_fmt = "0.4.1" +# https://github.com/nical/rust_debug/pull/4 +svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" @@ -169,10 +178,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-opentelemetry = "0.21.0" +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 3a452fec32..19739cc1f8 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,8 +58,14 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv 
protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -135,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.76.0 +ENV RUSTC_VERSION=1.78.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ @@ -149,7 +155,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt && \ cargo install cargo-hakari && \ - cargo install cargo-deny && \ + cargo install cargo-deny --locked && \ cargo install cargo-hack && \ cargo install cargo-nextest && \ rm -rf /home/nonroot/.cargo/registry && \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 076f41325c..f47fb316b6 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -947,6 +947,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# Create remote extension download directory +RUN mkdir 
/usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions + # Install: # libreadline8 for psql # libicu67, locales for collations (including ICU and plpgsql_check) diff --git a/Makefile b/Makefile index f13f080f1a..dcbfdbcbc1 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the @@ -79,11 +81,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: echo "'git submodule update --init --recursive --depth 2 --progress .' 
in project root.\n"; \ exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* - (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure' # Note: I've been unable to use templates for this part of our configuration. diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 117919786e..9295f091d5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -47,10 +47,11 @@ use chrono::Utc; use clap::Arg; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeSpec; use compute_tools::compute::{ forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, @@ -62,12 +63,41 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. 
Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let (build_tag, clap_args) = init()?; + + let (pg_handle, start_pg_result) = { + // Enter startup tracing context + let _startup_context_guard = startup_context_from_env(); + + let cli_args = process_cli(&clap_args)?; + + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + + start_postgres(&clap_args, wait_spec_result)? + + // Startup is finished, exit the startup tracing span + }; + + // PostgreSQL is now running, if startup was successful. Wait until it exits. + let wait_pg_result = wait_postgres(pg_handle)?; + + let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; + + maybe_delay_exit(delay_exit); + + deinit_and_exit(wait_pg_result); +} + +fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -82,9 +112,15 @@ fn main() -> Result<()> { .to_string(); info!("build_tag: {build_tag}"); - let matches = cli().get_matches(); - let pgbin_default = String::from("postgres"); - let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); + Ok((build_tag, cli().get_matches())) +} + +fn process_cli(matches: &clap::ArgMatches) -> Result { + let pgbin_default = "postgres"; + let pgbin = matches + .get_one::("pgbin") + .map(|s| s.as_str()) + .unwrap_or(pgbin_default); let ext_remote_storage = matches .get_one::("remote-ext-config") @@ -110,7 +146,32 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + Ok(ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + http_port, + spec_json, + spec_path, + resize_swap_on_bind, + }) +} + 
+struct ProcessCliResult<'clap> { + connstr: &'clap str, + pgdata: &'clap str, + pgbin: &'clap str, + ext_remote_storage: Option<&'clap str>, + http_port: u16, + spec_json: Option<&'clap String>, + spec_path: Option<&'clap String>, + resize_swap_on_bind: bool, +} + +fn startup_context_from_env() -> Option { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -147,7 +208,7 @@ fn main() -> Result<()> { if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } - let startup_context_guard = if !startup_tracing_carrier.is_empty() { + if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() @@ -157,8 +218,17 @@ fn main() -> Result<()> { Some(guard) } else { None - }; + } +} +fn try_spec_from_cli( + matches: &clap::ArgMatches, + ProcessCliResult { + spec_json, + spec_path, + .. + }: &ProcessCliResult, +) -> Result { let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -199,6 +269,34 @@ fn main() -> Result<()> { } }; + Ok(CliSpecParams { + spec, + live_config_allowed, + }) +} + +struct CliSpecParams { + /// If a spec was provided via CLI or file, the [`ComputeSpec`] + spec: Option, + live_config_allowed: bool, +} + +fn wait_spec( + build_tag: String, + ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + resize_swap_on_bind, + http_port, + .. + }: ProcessCliResult, + CliSpecParams { + spec, + live_config_allowed, + }: CliSpecParams, +) -> Result { let mut new_state = ComputeState::new(); let spec_set; @@ -226,19 +324,17 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. 
Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. + // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); - let extension_server_port: u16 = http_port; - if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); @@ -253,21 +349,45 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } + Ok(WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }) +} + +struct WaitSpecResult { + compute: Arc, + // passed through from ProcessCliResult + http_port: u16, + resize_swap_on_bind: bool, +} + +fn start_postgres( + // need to allow unused because `matches` is only used if target_os = "linux" + #[allow(unused_variables)] matches: &clap::ArgMatches, + WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }: WaitSpecResult, +) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. 
- state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); @@ -275,33 +395,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. + let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. 
+ // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + let extension_server_port: u16 = http_port; + + // Start Postgres + let mut pg = None; + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. 
@@ -334,7 +493,7 @@ fn main() -> Result<()> { // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); - let vm_monitor = &rt.as_ref().map(|rt| { + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), @@ -347,12 +506,41 @@ fn main() -> Result<()> { } } + Ok(( + pg, + StartPostgresResult { + delay_exit, + compute, + #[cfg(target_os = "linux")] + rt, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + vm_monitor, + }, + )) +} + +type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); + +struct StartPostgresResult { + delay_exit: bool, + // passed through from WaitSpecResult + compute: Arc, + + #[cfg(target_os = "linux")] + rt: Option, + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, +} + +fn wait_postgres(pg: Option) -> Result { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. + let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { - // Startup is finished, exit the startup tracing span - drop(startup_context_guard); - let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -367,6 +555,25 @@ fn main() -> Result<()> { exit_code = ecode.code() } + Ok(WaitPostgresResult { exit_code }) +} + +struct WaitPostgresResult { + exit_code: Option, +} + +fn cleanup_after_postgres_exit( + StartPostgresResult { + mut delay_exit, + compute, + #[cfg(target_os = "linux")] + vm_monitor, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + rt, + }: StartPostgresResult, +) -> Result { // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. 
@@ -408,13 +615,19 @@ fn main() -> Result<()> { error!("error while checking for core dumps: {err:?}"); } + Ok(delay_exit) +} + +fn maybe_delay_exit(delay_exit: bool) { // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); } +} +fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: @@ -526,6 +739,11 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0fa315682d..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected 
name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config 
handle_migrations") }); Ok(()) } @@ -1262,10 +1273,12 @@ LIMIT 100", .await .map_err(DownloadError::Other); - self.ext_download_progress - .write() - .expect("bad lock") - .insert(ext_archive_name.to_string(), (download_start, true)); + if download_size.is_ok() { + self.ext_download_progress + .write() + .expect("bad lock") + .insert(ext_archive_name.to_string(), (download_start, true)); + } download_size } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42b8480211..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { .write(true) .create(true) .append(false) + .truncate(false) .open(path)?; let buf = io::BufReader::new(&file); let mut count: usize = 0; @@ -91,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). 
+ .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..eac808385c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -14,4 +14,5 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 3b596a88ff..3a6e18b638 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). + // from neon_superuser. 
let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -743,21 +744,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // which may happen in two cases: // - extension was just installed // - extension was already installed and is up to date - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + if let Err(e) = client.simple_query(query) { + error!( + "failed to upgrade neon extension during `handle_extension_neon`: {}", + e + ); + } Ok(()) } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade (not really)"); - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; +pub fn handle_neon_extension_upgrade(client: &mut Client) -> 
Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; Ok(()) } @@ -806,43 +810,40 @@ $$;"#, "", "", "", + "", // Add new migrations below. - r#" -DO $$ -DECLARE - role_name TEXT; -BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE - LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; - END LOOP; -END -$$;"#, ]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = 
client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -850,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -858,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..c22b6bc14e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,36 @@ +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. 
+ let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + return Ok(()); + } + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b544a8c587..e62f3b8a47 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,6 +17,7 @@ nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } @@ -27,6 +28,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs deleted file mode 100644 index bebc62ac2f..0000000000 --- a/control_plane/attachment_service/src/compute_hook.rs +++ /dev/null @@ -1,462 +0,0 @@ -use std::{collections::HashMap, time::Duration}; - -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; -use control_plane::local_env::LocalEnv; -use hyper::{Method, StatusCode}; -use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; -use 
postgres_connection::parse_host_port; -use serde::{Deserialize, Serialize}; -use tokio_util::sync::CancellationToken; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; - -use crate::service::Config; - -const BUSY_DELAY: Duration = Duration::from_secs(1); -const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); - -pub(crate) const API_CONCURRENCY: usize = 32; - -struct ShardedComputeHookTenant { - stripe_size: ShardStripeSize, - shard_count: ShardCount, - shards: Vec<(ShardNumber, NodeId)>, -} - -enum ComputeHookTenant { - Unsharded(NodeId), - Sharded(ShardedComputeHookTenant), -} - -impl ComputeHookTenant { - /// Construct with at least one shard's information - fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { - if tenant_shard_id.shard_count.count() > 1 { - Self::Sharded(ShardedComputeHookTenant { - shards: vec![(tenant_shard_id.shard_number, node_id)], - stripe_size, - shard_count: tenant_shard_id.shard_count, - }) - } else { - Self::Unsharded(node_id) - } - } - - /// Set one shard's location. If stripe size or shard count have changed, Self is reset - /// and drops existing content. - fn update( - &mut self, - tenant_shard_id: TenantShardId, - stripe_size: ShardStripeSize, - node_id: NodeId, - ) { - match self { - Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { - *existing_node_id = node_id - } - Self::Sharded(sharded_tenant) - if sharded_tenant.stripe_size == stripe_size - && sharded_tenant.shard_count == tenant_shard_id.shard_count => - { - if let Some(existing) = sharded_tenant - .shards - .iter() - .position(|s| s.0 == tenant_shard_id.shard_number) - { - sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; - } else { - sharded_tenant - .shards - .push((tenant_shard_id.shard_number, node_id)); - sharded_tenant.shards.sort_by_key(|s| s.0) - } - } - _ => { - // Shard count changed: reset struct. 
- *self = Self::new(tenant_shard_id, stripe_size, node_id); - } - } - } -} - -#[derive(Serialize, Deserialize, Debug)] -struct ComputeHookNotifyRequestShard { - node_id: NodeId, - shard_number: ShardNumber, -} - -/// Request body that we send to the control plane to notify it of where a tenant is attached -#[derive(Serialize, Deserialize, Debug)] -struct ComputeHookNotifyRequest { - tenant_id: TenantId, - stripe_size: Option, - shards: Vec, -} - -/// Error type for attempts to call into the control plane compute notification hook -#[derive(thiserror::Error, Debug)] -pub(crate) enum NotifyError { - // Request was not send successfully, e.g. transport error - #[error("Sending request: {0}")] - Request(#[from] reqwest::Error), - // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. - #[error("Control plane tenant busy")] - Busy, - // Explicit 429 response asking us to retry less frequently - #[error("Control plane overloaded")] - SlowDown, - // A 503 response indicates the control plane can't handle the request right now - #[error("Control plane unavailable (status {0})")] - Unavailable(StatusCode), - // API returned unexpected non-success status. We will retry, but log a warning. 
- #[error("Control plane returned unexpected status {0}")] - Unexpected(StatusCode), - // We shutdown while sending - #[error("Shutting down")] - ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 - #[error("Non-retryable error {0}")] - Fatal(StatusCode), -} - -impl ComputeHookTenant { - fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { - match self { - Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { - tenant_id, - shards: vec![ComputeHookNotifyRequestShard { - shard_number: ShardNumber(0), - node_id: *node_id, - }], - stripe_size: None, - }), - Self::Sharded(sharded_tenant) - if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => - { - Some(ComputeHookNotifyRequest { - tenant_id, - shards: sharded_tenant - .shards - .iter() - .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { - shard_number: *shard_number, - node_id: *node_id, - }) - .collect(), - stripe_size: Some(sharded_tenant.stripe_size), - }) - } - Self::Sharded(sharded_tenant) => { - // Sharded tenant doesn't yet have information for all its shards - - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - sharded_tenant.shards.len(), - sharded_tenant.shard_count.count() - ); - None - } - } - } -} - -/// The compute hook is a destination for notifications about changes to tenant:pageserver -/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures -/// the compute connection string. 
-pub(super) struct ComputeHook { - config: Config, - state: tokio::sync::Mutex>, - authorization_header: Option, -} - -impl ComputeHook { - pub(super) fn new(config: Config) -> Self { - let authorization_header = config - .control_plane_jwt_token - .clone() - .map(|jwt| format!("Bearer {}", jwt)); - - Self { - state: Default::default(), - config, - authorization_header, - } - } - - /// For test environments: use neon_local's LocalEnv to update compute - async fn do_notify_local( - &self, - reconfigure_request: ComputeHookNotifyRequest, - ) -> anyhow::Result<()> { - let env = match LocalEnv::load_config() { - Ok(e) => e, - Err(e) => { - tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); - return Ok(()); - } - }; - let cplane = - ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { - tenant_id, - shards, - stripe_size, - } = reconfigure_request; - - let compute_pageservers = shards - .into_iter() - .map(|shard| { - let ps_conf = env - .get_pageserver_conf(shard.node_id) - .expect("Unknown pageserver"); - let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (pg_host, pg_port.unwrap_or(5432)) - }) - .collect::>(); - - for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("Reconfiguring endpoint {}", endpoint_name,); - endpoint - .reconfigure(compute_pageservers.clone(), stripe_size) - .await?; - } - } - - Ok(()) - } - - async fn do_notify_iteration( - &self, - client: &reqwest::Client, - url: &String, - reconfigure_request: &ComputeHookNotifyRequest, - cancel: &CancellationToken, - ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; - - tracing::info!( - 
"Sending notify request to {} ({:?})", - url, - reconfigure_request - ); - let send_result = req.json(&reconfigure_request).send().await; - let response = match send_result { - Ok(r) => r, - Err(e) => return Err(e.into()), - }; - - // Treat all 2xx responses as success - if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { - if response.status() != StatusCode::OK { - // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so - // log a warning. - tracing::warn!( - "Unexpected 2xx response code {} from control plane", - response.status() - ); - } - - return Ok(()); - } - - // Error response codes - match response.status() { - StatusCode::TOO_MANY_REQUESTS => { - // TODO: 429 handling should be global: set some state visible to other requests - // so that they will delay before starting, rather than all notifications trying - // once before backing off. - tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled()) - .await - .ok(); - Err(NotifyError::SlowDown) - } - StatusCode::LOCKED => { - // Delay our retry if busy: the usual fast exponential backoff in backoff::retry - // is not appropriate - tokio::time::timeout(BUSY_DELAY, cancel.cancelled()) - .await - .ok(); - Err(NotifyError::Busy) - } - StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), - StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - Err(NotifyError::Fatal(response.status())) - } - _ => Err(NotifyError::Unexpected(response.status())), - } - } - - async fn do_notify( - &self, - url: &String, - reconfigure_request: ComputeHookNotifyRequest, - cancel: &CancellationToken, - ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - backoff::retry( - || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), - |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), - 3, - 10, - "Send 
compute notification", - cancel, - ) - .await - .ok_or_else(|| NotifyError::ShuttingDown) - .and_then(|x| x) - } - - /// Call this to notify the compute (postgres) tier of new pageservers to use - /// for a tenant. notify() is called by each shard individually, and this function - /// will decide whether an update to the tenant is sent. An update is sent on the - /// condition that: - /// - We know a pageserver for every shard. - /// - All the shards have the same shard_count (i.e. we are not mid-split) - /// - /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler - /// that is cancelled. - /// - /// This function is fallible, including in the case that the control plane is transiently - /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability - /// periods, but we don't retry forever. The **caller** is responsible for handling failures and - /// ensuring that they eventually call again to ensure that the compute is eventually notified of - /// the proper pageserver nodes for a tenant. 
- #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, - cancel: &CancellationToken, - ) -> Result<(), NotifyError> { - let mut locked = self.state.lock().await; - - use std::collections::hash_map::Entry; - let tenant = match locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant - } - }; - - let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); - let Some(reconfigure_request) = reconfigure_request else { - // The tenant doesn't yet have pageservers for all its shards: we won't notify anything - // until it does. - tracing::info!("Tenant isn't yet ready to emit a notification"); - return Ok(()); - }; - - if let Some(notify_url) = &self.config.compute_hook_url { - self.do_notify(notify_url, reconfigure_request, cancel) - .await - } else { - self.do_notify_local(reconfigure_request) - .await - .map_err(|e| { - // This path is for testing only, so munge the error into our prod-style error type. 
- tracing::error!("Local notification hook failed: {e}"); - NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) - }) - } - } -} - -#[cfg(test)] -pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; - use utils::id::TenantId; - - use super::*; - - #[test] - fn tenant_updates() -> anyhow::Result<()> { - let tenant_id = TenantId::generate(); - let mut tenant_state = ComputeHookTenant::new( - TenantShardId { - tenant_id, - shard_count: ShardCount::new(0), - shard_number: ShardNumber(0), - }, - ShardStripeSize(12345), - NodeId(1), - ); - - // An unsharded tenant is always ready to emit a notification - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 1 - ); - assert!(tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size - .is_none()); - - // Writing the first shard of a multi-sharded situation (i.e. in a split) - // resets the tenant state and puts it in an non-notifying state (need to - // see all shards) - tenant_state.update( - TenantShardId { - tenant_id, - shard_count: ShardCount::new(2), - shard_number: ShardNumber(1), - }, - ShardStripeSize(32768), - NodeId(1), - ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); - - // Writing the second shard makes it ready to notify - tenant_state.update( - TenantShardId { - tenant_id, - shard_count: ShardCount::new(2), - shard_number: ShardNumber(0), - }, - ShardStripeSize(32768), - NodeId(1), - ); - - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 2 - ); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size, - Some(ShardStripeSize(32768)) - ); - - Ok(()) - } -} diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs deleted file mode 100644 index 
b03700b50c..0000000000 --- a/control_plane/attachment_service/src/id_lock_map.rs +++ /dev/null @@ -1,54 +0,0 @@ -use std::{collections::HashMap, sync::Arc}; - -/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't -/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. -/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking -/// is needed at a tenant-wide granularity. -pub(crate) struct IdLockMap -where - T: Eq + PartialEq + std::hash::Hash, -{ - /// A synchronous lock for getting/setting the async locks that our callers will wait on. - entities: std::sync::Mutex>>>, -} - -impl IdLockMap -where - T: Eq + PartialEq + std::hash::Hash, -{ - pub(crate) fn shared( - &self, - key: T, - ) -> impl std::future::Future> { - let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().read_owned() - } - - pub(crate) fn exclusive( - &self, - key: T, - ) -> impl std::future::Future> { - let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().write_owned() - } - - /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do - /// periodic housekeeping to avoid the map growing indefinitely - pub(crate) fn housekeeping(&self) { - let mut locked = self.entities.lock().unwrap(); - locked.retain(|_k, lock| lock.try_write().is_err()) - } -} - -impl Default for IdLockMap -where - T: Eq + PartialEq + std::hash::Hash, -{ - fn default() -> Self { - Self { - entities: std::sync::Mutex::new(HashMap::new()), - } - } -} diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 0e59b28230..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) 
.stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) @@ -294,7 +306,7 @@ where // is in state 'taken' but the thread that would unlock it is // not there. // 2. A rust object that represented some external resource in the - // parent now got implicitly copied by the the fork, even though + // parent now got implicitly copied by the fork, even though // the object's type is not `Copy`. The parent program may use // non-copyability as way to enforce unique ownership of an // external resource in the typesystem. 
The fork breaks that diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 401feae706..18e395e2b5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,22 +9,23 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, +use pageserver_api::config::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, -}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ @@ -54,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = 
'{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} - /// /// Timelines tree element used as a value in the HashMap. /// @@ -135,7 +98,7 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(sub_args, &env)), + "start" => rt.block_on(handle_start_all(&env)), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), @@ -154,7 +117,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -343,48 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = 
init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? } else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + 
control_plane_compute_hook_api: None, + } }; - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config_overrides(init_match)) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. 
@@ -399,15 +379,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .get_many::("pageserver-config-override") - .into_iter() - .flatten() - .map(String::as_str) - .collect() -} - async fn handle_tenant( tenant_match: &ArgMatches, env: &mut local_env::LocalEnv, @@ -419,6 +390,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -791,6 +810,8 @@ async fn 
handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -808,7 +829,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -837,6 +860,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -862,11 +887,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1022,10 +1049,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? 
- .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1051,30 +1075,12 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } } - Some(("set-state", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - let scheduling = subcommand_args.get_one("scheduling"); - let availability = subcommand_args.get_one("availability"); - - let storage_controller = StorageController::from_env(env); - storage_controller - .node_configure(NodeConfigureRequest { - node_id: pageserver.conf.id, - scheduling: scheduling.cloned(), - availability: availability.cloned(), - }) - .await?; - } - Some(("status", subcommand_args)) => { match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), @@ -1196,7 +1202,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically broker::start_broker_process(env).await?; @@ -1213,10 +1219,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true).await; exit(1); @@ -1248,7 +1251,7 @@ async fn 
try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1357,13 +1360,6 @@ fn cli() -> Command { .required(false) .value_name("stop-mode"); - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .num_args(1) - .action(ArgAction::Append) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) @@ -1397,9 +1393,7 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); let update_catalog = Arg::new("update-catalog") .value_parser(value_parser!(bool)) @@ -1413,20 +1407,25 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) .subcommand( Command::new("init") .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(pageserver_config_args.clone()) .arg(num_pageservers_arg.clone()) .arg( Arg::new("config") .long("config") .required(false) .value_parser(value_parser!(PathBuf)) - .value_name("config"), + .value_name("config") ) .arg(pg_version_arg.clone()) .arg(force_arg) @@ -1434,6 +1433,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1496,6 +1496,8 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") @@ -1505,7 +1507,6 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1513,21 +1514,14 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") - .arg(pageserver_config_args.clone()) - ) - .subcommand(Command::new("set-state") - .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active")) - 
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active")) - .about("Set scheduling or availability state of pageserver node") - .arg(pageserver_config_args.clone()) ) ) .subcommand( Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( @@ -1573,6 +1567,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1581,6 +1576,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") @@ -1632,7 +1628,6 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") - .arg(pageserver_config_args) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5206222961..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -12,7 +12,7 @@ //! //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads -//! the basebackup from the pageserver to initialize the the data directory, and +//! the basebackup from the pageserver to initialize the data directory, and //! 
finally launches the PostgreSQL process. It watches the PostgreSQL process //! until it exits. //! @@ -554,6 +554,7 @@ impl Endpoint { format_version: 1.0, operation_uuid: None, features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..d13884198e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use clap::ValueEnum; use postgres_backend::AuthType; @@ -17,11 +17,14 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -33,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). @@ -41,55 +44,99 @@ pub struct LocalEnv { // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or // '.neon' if not given. - #[serde(skip)] pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. 
If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. - #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver + /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - #[serde(default)] pub control_plane_api: Option, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. - #[serde(default)] pub control_plane_compute_hook_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. - #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. 
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -98,6 +145,29 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. + const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. impl Default for NeonBroker { fn default() -> Self { @@ -113,22 +183,18 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. +// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - pub(crate) virtual_file_io_engine: Option, - pub(crate) get_vectored_impl: Option, } impl Default for PageServerConf { @@ -139,8 +205,40 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - virtual_file_io_engine: None, - get_vectored_impl: None, + } + } +} + +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. 
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, } } } @@ -156,6 +254,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +268,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } @@ -326,41 +426,7 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. 
- if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config + /// Construct `Self` from on-disk state. pub fn load_config() -> anyhow::Result { let repopath = base_path(); @@ -374,38 +440,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. 
+ assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(&repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] + .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in 
`create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -430,17 +587,13 @@ impl LocalEnv { } } - // - // Initialize a new Neon repository - 
// - pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. + pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + // create base_path dir if base_path.exists() { match force { InitForceMode::MustNotExist => { @@ -472,70 +625,96 @@ impl LocalEnv { } } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. + let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. 
// // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. - if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. + generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance op operation + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in 
&env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { + let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - self.persist_config(base_path) - } + // setup remote remote location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { +pub fn base_path() -> PathBuf { match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), None => PathBuf::from(".neon"), @@ -578,31 +757,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( - simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the 
toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" - ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c5eabc46db..5a84763697 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -4,21 +4,21 @@ //! //! .neon/ //! -use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; -use std::process::Command; +use std::str::FromStr; use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; use pageserver_api::models::{ - self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo, + self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, + TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -30,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -74,57 +74,23 @@ impl PageServerNode { } } - /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration. - /// - /// These all end up on the command line of the `pageserver` binary. 
- fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { + fn pageserver_init_make_toml( + &self, + conf: NeonLocalInitPageserverConf, + ) -> anyhow::Result { + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let PageServerConf { - id, - listen_pg_addr, - listen_http_addr, - pg_auth_type, - http_auth_type, - virtual_file_io_engine, - get_vectored_impl, - } = &self.conf; - - let id = format!("id={}", id); - - let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { - format!("virtual_file_io_engine='{virtual_file_io_engine}'") - } else { - String::new() - }; - let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { - format!("get_vectored_impl='{get_vectored_impl}'") - } else { - String::new() - }; - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - virtual_file_io_engine, - get_vectored_impl, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( @@ -134,7 +100,7 @@ 
impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. - if matches!(http_auth_type, AuthType::NeonJWT) { + if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -143,31 +109,40 @@ impl PageServerNode { } } - if !cli_overrides - .iter() - .any(|c| c.starts_with("remote_storage")) - { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.extend(cli_overrides.iter().map(|&c| c.to_owned())); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); - overrides + // Turn `overrides` into a toml document. + // TODO: above code is legacy code, it should be refactored to use toml_edit directly. + let mut config_toml = toml_edit::Document::new(); + for fragment_str in overrides { + let fragment = toml_edit::Document::from_str(&fragment_str) + .expect("all fragments in `overrides` are valid toml documents, this function controls that"); + for (key, item) in fragment.iter() { + config_toml.insert(key, item.clone()); + } + } + Ok(config_toml) } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - // First, run `pageserver --init` and wait for it to write a config into FS and exit. 
- self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -183,11 +158,11 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - self.start_node(config_overrides, false).await + pub async fn start(&self) -> anyhow::Result<()> { + self.start_node().await } - fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -198,29 +173,20 @@ impl PageServerNode { ); io::stdout().flush()?; - if !datadir.exists() { - std::fs::create_dir(&datadir)?; - } - - let datadir_path_str = datadir.to_str().with_context(|| { - format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") - })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - args.push(Cow::Borrowed("--init")); - - let init_output = Command::new(self.env.pageserver_bin()) - .args(args.iter().map(Cow::as_ref)) - .envs(self.pageserver_env_variables()?) 
- .output() - .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; - - anyhow::ensure!( - init_output.status.success(), - "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", - node_id, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr), - ); + let config = self + .pageserver_init_make_toml(conf) + .context("make pageserver toml")?; + let config_file_path = datadir.join("pageserver.toml"); + let mut config_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&config_file_path) + .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; + config_file + .write_all(config.to_string().as_bytes()) + .context("write pageserver toml")?; + drop(config_file); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with // the storage controller @@ -234,12 +200,13 @@ impl PageServerNode { // situation: the metadata is written by some other script. 
std::fs::write( metadata_path, - serde_json::to_vec(&serde_json::json!({ - "host": "localhost", - "port": self.pg_connection_config.port(), - "http_host": "localhost", - "http_port": http_port, - })) + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) .unwrap(), ) .expect("Failed to write metadata file"); @@ -247,11 +214,7 @@ impl PageServerNode { Ok(()) } - async fn start_node( - &self, - config_overrides: &[&str], - update_config: bool, - ) -> anyhow::Result<()> { + async fn start_node(&self) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -268,15 +231,12 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - if update_config { - args.push(Cow::Borrowed("--update-config")); - } + let args = vec!["-D", datadir_path_str]; background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), - args.iter().map(Cow::as_ref), + args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { @@ -293,22 +253,6 @@ impl PageServerNode { Ok(()) } - fn pageserver_basic_args<'a>( - &self, - config_overrides: &'a [&'a str], - datadir_path_str: &'a str, - ) -> Vec> { - let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]; - - let overrides = self.neon_local_overrides(config_overrides); - for config_override in overrides { - args.push(Cow::Borrowed("-c")); - args.push(Cow::Owned(config_override)); - } - - args - } - fn pageserver_env_variables(&self) -> anyhow::Result> { // FIXME: why is this tied to pageserver's auth type? 
Whether or not the safekeeper // needs a token, and how to generate that token, seems independent to whether @@ -389,6 +333,10 @@ impl PageServerNode { .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose()?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose()?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -430,6 +378,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_aux_file_policy'")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -501,6 +454,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_check_threshold' as integer")?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -542,6 +501,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_aux_file_policy'")?, } }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: 
PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), 
listen_pg_tenant_only]); } if !self.conf.sync { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e7697ecac8..f1c43f4036 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,6 +1,8 @@ -use crate::{background_process, local_env::LocalEnv}; +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Method; use pageserver_api::{ controller_api::{ NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, @@ -14,6 +16,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; +use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{fs, str::FromStr}; use tokio::process::Command; @@ -32,15 +35,13 @@ pub struct StorageController { public_key: Option, postgres_port: u16, client: reqwest::Client, + config: NeonStorageControllerConf, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; -// Use a shorter pageserver unavailability interval than the default to speed up tests. -const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); - #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -135,6 +136,7 @@ impl StorageController { client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), + config: env.storage_controller.clone(), } } @@ -272,17 +274,16 @@ impl StorageController { // Run migrations on every startup, in case something changed. 
let database_url = self.setup_database().await?; - let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); - let mut args = vec![ "-l", &self.listen, "-p", self.path.as_ref(), + "--dev", "--database-url", &database_url, "--max-unavailable-interval", - &max_unavailable.to_string(), + &humantime::Duration::from(self.config.max_unavailable).to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -378,7 +379,7 @@ impl StorageController { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result @@ -471,6 +472,16 @@ impl StorageController { .await } + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..61eb7fa4e4 --- /dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..c19bc96cdb --- /dev/null +++ 
b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,680 @@ +use std::{collections::HashMap, str::FromStr, time::Duration}; + +use clap::{Parser, Subcommand}; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, StatusCode, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, +}; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. 
+ #[arg(long)] + scheduling: Option, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies. + #[arg(long)] + scheduling: Option, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. + TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. 
+ TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Attempt to balance the locations for a tenant across pageservers. This is a client-side + /// alternative to the storage controller's scheduling optimization behavior. + TenantScatter { + #[arg(long)] + tenant_id: TenantId, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. 
+ jwt: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed 
to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + vps_client + .tenant_create(&TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: 
Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + 
format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + ) + .await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantScatter { tenant_id } => { + // Find the shards + let locate_response = storcon_client + .dispatch::<(), TenantLocateResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await?; + let shards = locate_response.shards; + + let mut node_to_shards: HashMap> = HashMap::new(); + let shard_count = shards.len(); + for s in shards { + let entry = node_to_shards.entry(s.node_id).or_default(); + entry.push(s.shard_id); + } + + // Load list of available nodes + let nodes_resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + for node in nodes_resp { + if matches!(node.availability, NodeAvailabilityWrapper::Active) { + node_to_shards.entry(node.id).or_default(); + } + } + + let max_shard_per_node = shard_count / node_to_shards.len(); + + loop { + let mut 
migrate_shard = None; + for shards in node_to_shards.values_mut() { + if shards.len() > max_shard_per_node { + // Pick the emptiest + migrate_shard = Some(shards.pop().unwrap()); + } + } + let Some(migrate_shard) = migrate_shard else { + break; + }; + + // Pick the emptiest node to migrate to + let mut destinations = node_to_shards + .iter() + .map(|(k, v)| (k, v.len())) + .collect::>(); + destinations.sort_by_key(|i| i.1); + let (destination_node, destination_count) = *destinations.first().unwrap(); + if destination_count + 1 > max_shard_per_node { + // Even the emptiest destination doesn't have space: we're done + break; + } + let destination_node = *destination_node; + + node_to_shards + .get_mut(&destination_node) + .unwrap() + .push(migrate_shard); + + println!("Migrate {} -> {} ...", migrate_shard, destination_node); + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{migrate_shard}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id: migrate_shard, + node_id: destination_node, + }), + ) + .await?; + println!("Migrate {} -> {} OK", migrate_shard, destination_node); + } + + // Spread the shards across the nodes + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + 
table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. + } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. + } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + 
progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } + } + + Ok(()) +} diff --git a/diesel.toml b/diesel.toml index 30ed4444d7..558c54a1e1 100644 --- a/diesel.toml +++ b/diesel.toml @@ -2,8 +2,8 @@ # see https://diesel.rs/guides/configuring-diesel-cli [print_schema] -file = "control_plane/attachment_service/src/schema.rs" +file = "storage_controller/src/schema.rs" custom_type_derives = ["diesel::query_builder::QueryId"] [migrations_directory] -dir = "control_plane/attachment_service/migrations" +dir = "storage_controller/migrations" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 12fa80349e..3732bfdab2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) +`storage_controller`: + +Neon storage controller, manages a cluster of pageservers and exposes an API that enables +managing a many-sharded tenant as a single entity. + `/control_plane`: Local control plane. diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..daf4d0c8b7 --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). 
+
+It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
+the underlying details of how data is spread across multiple nodes.
+
+The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
+
+## APIs
+
+The storage controller’s HTTP server implements four logically separate APIs:
+
+- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
+- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
+- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
+- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
+  to ensure data safety with generation numbers.
+
+The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
+
+See the `http.rs` file in the source for where the HTTP APIs are implemented.
+
+## Database
+
+The storage controller uses a postgres database to persist a subset of its state. 
Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
+persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
+rebuilt on startup.
+
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+
+The `diesel` crate is used for defining models & migrations.
+
+Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
+
+### Diesel tip: migrations
+
+If you need to modify the database schema, here’s how to create a migration:
+
+- Install the diesel CLI with `cargo install diesel_cli`
+- Use `diesel migration generate <name>` to create a new migration
+- Populate the SQL files in the `migrations/` subdirectory
+- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
+- Commit the migration files and the changes to schema.rs
+- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
+- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. 
+ +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. 
+ +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. 
The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1}, + ], +} +``` diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 71ae66c45c..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. + /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. 
pub cluster: Cluster, pub delta_operations: Option>, diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 46a623b0e2..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -40,7 +45,7 @@ macro_rules! register_hll { }}; ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ - $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) }}; } @@ -93,203 +98,25 @@ macro_rules! 
register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
-/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. 
self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
- let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + 
.enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..141d8a6d01 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. 
#![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + 
+#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. 
@@ -117,14 +250,22 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { @@ -151,6 +292,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +330,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +346,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } @@ -285,3 +430,180 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } + + pub fn sample(&self, labels: ::Group<'_>) -> u64 { + let id = self.vec.with_labels(labels); + let metric = self.vec.get_metric(id); + + let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); + let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); + inc.saturating_sub(dec) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: 
::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. 
+struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + 
+#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. + #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index e33bd0f486..1278f17ad2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -2,9 +2,9 @@ use std::str::FromStr; /// 
Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server -/// in [`attachment_service::http`] +/// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::NodeId; +use utils::id::{NodeId, TenantId}; use crate::{ models::{ShardParameters, TenantConfig}, @@ -42,6 +42,12 @@ pub struct NodeConfigureRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantPolicyRequest { + pub placement: Option, + pub scheduling: Option, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -62,12 +68,27 @@ pub struct TenantLocateResponse { #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponse { + pub tenant_id: TenantId, pub shards: Vec, pub stripe_size: ShardStripeSize, pub policy: PlacementPolicy, pub config: TenantConfig, } +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, @@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard { pub is_pending_compute_notification: bool, /// A shard split is currently underway pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, } /// Explicitly migrating a particular shard is a low level operation @@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest { /// Utilisation score indicating how good a candidate a pageserver /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// Lower values are better. 
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] pub struct UtilizationScore(pub u64); impl UtilizationScore { @@ -106,7 +129,7 @@ impl UtilizationScore { } } -#[derive(Serialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state @@ -129,7 +152,7 @@ impl Eq for NodeAvailability {} // This wrapper provides serde functionality and it should only be used to // communicate with external callers which don't know or care about the // utilisation score of the pageserver it is targeting. -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, Offline, @@ -155,22 +178,33 @@ impl From for NodeAvailabilityWrapper { } } -impl FromStr for NodeAvailability { - type Err = anyhow::Error; +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum ShardSchedulingPolicy { + // Normal mode: the tenant's scheduled locations may be updated at will, including + // for non-essential optimization. + Active, - fn from_str(s: &str) -> Result { - match s { - // This is used when parsing node configuration requests from neon-local. - // Assume the worst possible utilisation score - // and let it get updated via the heartbeats. - "active" => Ok(Self::Active(UtilizationScore::worst())), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } + // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. + // For example, this still permits a node's attachment location to change to a secondary in + // response to a node failure, or to assign a new secondary if a node was removed. + Essential, + + // No scheduling: leave the shard running wherever it currently is. 
Even if the shard is + // unavailable, it will not be rescheduled to another node. + Pause, + + // No reconciling: we will make no location_conf API calls to pageservers at all. If the + // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over. + Stop, +} + +impl Default for ShardSchedulingPolicy { + fn default() -> Self { + Self::Active } } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, Filling, diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,5 +1,6 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -21,15 +22,107 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x61; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x62; + +/// Check if the key falls in the range of metadata keys. +pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. 
+ pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. + pub const fn metadata_key_range() -> Range { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. + pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
/// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -39,7 +132,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, @@ -48,11 +141,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -81,6 +174,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +187,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -475,12 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. 
+pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] @@ -556,11 +658,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -576,4 +681,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..c0c4710a00 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,10 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; use itertools::Itertools; /// @@ -14,44 +17,279 @@ pub struct KeySpace { pub ranges: Vec>, } -impl KeySpace { +/// A wrapper type for sparse keyspaces. 
+#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. +/// - Within such ranges, we may calculate distances using simple subtraction of field6. 
+fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. 
+ ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". + pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. 
+ let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. 
+ let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. /// + /// Don't use this function in code that works with physical entities like layer files. + pub fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + +impl KeySpace { + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. 
- let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -64,6 +302,10 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_raw_size() == 0 + } + /// Merge another keyspace into the current one. 
/// Note: the keyspaces must not ovelap (enforced via assertions) pub fn merge(&mut self, other: &KeySpace) { @@ -94,12 +336,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +355,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. 
+ removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -146,11 +396,11 @@ impl KeySpace { self.ranges.last().map(|range| range.end) } - #[allow(unused)] - pub fn total_size(&self) -> usize { + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { self.ranges .iter() - .map(|range| key_range_size(range) as usize) + .map(|range| ShardedRange::raw_size(range) as usize) .sum() } @@ -170,6 +420,11 @@ impl KeySpace { pub fn overlaps(&self, range: &Range) -> bool { self.overlaps_at(range).is_some() } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) + } } /// @@ -184,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. 
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// @@ -219,7 +497,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -251,7 +529,9 @@ impl KeySpaceAccum { std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. + pub fn raw_size(&self) -> u64 { self.size } } @@ -307,36 +587,19 @@ impl KeySpaceRandomAccum { } } -#[inline(always)] -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - pub fn singleton_range(key: Key) -> Range { key..key.next() } #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -379,14 +642,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); 
assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -553,7 +819,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +858,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +895,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +926,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -650,4 +949,412 @@ mod tests { ] ); } + #[test] + fn 
sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. 
+ assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. + fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment 
+ assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to 
the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. 
+ #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + 
do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible. 
+ let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 4GiB in size, that are always at least 1 + let range_size = prng.next_u32() % 8192 + 1; + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 1b948d60c3..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,5 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; pub mod controller_api; pub mod key; @@ -11,7 +10,4 @@ pub mod shard; /// Public API types pub mod 
upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index aad4cc97fc..1df5820fb9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,3 +1,4 @@ +pub mod detach_ancestor; pub mod partitioning; pub mod utilization; @@ -8,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, time::{Duration, SystemTime}, }; @@ -20,6 +22,7 @@ use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::controller_api::PlacementPolicy; @@ -301,6 +304,32 @@ pub struct TenantConfig { pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, + pub image_layer_creation_check_threshold: Option, + pub switch_aux_file_policy: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum AuxFilePolicy { + V1, + V2, + CrossValidation, +} + +impl FromStr for AuxFilePolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let s = s.to_lowercase(); + if s == "v1" { + Ok(Self::V1) + } else if s == "v2" { + Ok(Self::V2) + } else if s == "crossvalidation" || s == "cross_validation" { + Ok(Self::CrossValidation) + } else { + anyhow::bail!("cannot parse {} to aux file policy", s) + } + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -427,7 +456,6 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a 
flattened field, we should reject all unknown fields in it } @@ -745,10 +773,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating @@ -757,11 +793,7 @@ pub struct WalRedoManagerStatus { #[derive(Default, Debug, Serialize, Deserialize, Clone)] pub struct SecondaryProgress { /// The remote storage LastModified time of the heatmap object we last downloaded. - #[serde( - serialize_with = "opt_ser_rfc3339_millis", - deserialize_with = "opt_deser_rfc3339_millis" - )] - pub heatmap_mtime: Option, + pub heatmap_mtime: Option, /// The number of layers currently on-disk pub layers_downloaded: usize, @@ -774,27 +806,15 @@ pub struct SecondaryProgress { pub bytes_total: u64, } -fn opt_ser_rfc3339_millis( - ts: &Option, - serializer: S, -) -> Result { - match ts { - Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), - None => serializer.serialize_none(), - } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, } -fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let s: Option = serde::de::Deserialize::deserialize(deserializer)?; - match s { - None => Ok(None), - Some(s) => humantime::parse_rfc3339(&s) - .map_err(serde::de::Error::custom) - .map(Some), - } +#[derive(Serialize, Deserialize, Debug, 
Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, } pub mod virtual_file { @@ -864,39 +884,72 @@ impl TryFrom for PagestreamBeMessageTag { } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. There is no +// difference in the responses between V1 and V2. 
+// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetSlruSegmentRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub kind: u8, pub segno: u32, } @@ -943,14 +996,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. 
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -959,8 +1014,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -969,8 +1024,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -980,15 +1035,15 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -997,18 +1052,40 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1017,8 +1094,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1027,8 +1104,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1038,14 +1115,14 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1173,8 +1250,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1183,8 +1260,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -1193,8 +1270,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1204,14 +1281,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs new file mode 100644 index 0000000000..fc1f10e734 --- /dev/null +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -0,0 +1,6 @@ +use utils::id::TimelineId; + +#[derive(Default, serde::Serialize)] +pub struct AncestorDetached { + pub reparented_timelines: Vec, +} diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 
+1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index f5984dff5d..e88cab5d6a 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use utils::serde_system_time::SystemTime; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -21,28 +21,9 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. 
- #[serde( - serialize_with = "ser_rfc3339_millis", - deserialize_with = "deser_rfc3339_millis" - )] pub captured_at: SystemTime, } -fn ser_rfc3339_millis( - ts: &SystemTime, - serializer: S, -) -> Result { - serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) -} - -fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result -where - D: serde::de::Deserializer<'de>, -{ - let s: String = serde::de::Deserialize::deserialize(deserializer)?; - humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) -} - /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values @@ -69,7 +50,9 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, utilization_score: u64::MAX, - captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), }; let s = serde_json::to_string(&doc).unwrap(); diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..ff6d3d91b6 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,21 +5,99 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. 
+/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. 
For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. 
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as `TenantShardId::unsharded`. + /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). @@ -38,6 +116,9 @@ impl ShardCount { self.0 } + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +134,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. 
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +165,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +207,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +276,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +290,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. 
pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +360,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +439,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -414,7 +453,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), @@ -439,6 +478,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. 
pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +529,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +541,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +569,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } @@ -606,7 +652,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. 
+ let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index aa6845b9b1..0d6986778a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4a66a0ab1d..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -331,7 +331,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. 
+ ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index e87ca27e90..41afcea6c2 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,5 +1,6 @@ use anyhow::*; use clap::{value_parser, Arg, ArgMatches, Command}; +use postgres::Client; use std::{path::PathBuf, str::FromStr}; use wal_craft::*; @@ -8,8 +9,8 @@ fn main() -> Result<()> { .init(); let arg_matches = cli().get_matches(); - let wal_craft = |arg_matches: &ArgMatches, client| { - let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| { + let intermediate_lsns = match arg_matches .get_one::("type") .map(|s| s.as_str()) .context("'type' is required")? 
@@ -25,6 +26,7 @@ fn main() -> Result<()> { LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {a}"), }; + let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?; for lsn in intermediate_lsns { println!("intermediate_lsn = {lsn}"); } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 281a180e3b..262068cbda 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; -use std::cmp::Ordering; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -232,59 +233,62 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow pub trait Crafter { const NAME: &'static str; - /// Generates WAL using the client `client`. Returns a pair of: - /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. - /// May include or exclude Lsn(0) and the end-of-wal. - /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; + /// Generates WAL using the client `client`. Returns a vector of some valid + /// "interesting" intermediate LSNs which one may start reading from. + /// test_end_of_wal uses this to check various starting points. + /// + /// Note that postgres is generally keen about writing some WAL. While we + /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always + /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about + /// stable WAL end would be flaky unless postgres is shut down. 
For this + /// reason returning potential end of WAL here is pointless. Most of the + /// time this doesn't happen though, so it is reasonable to create needed + /// WAL structure and immediately kill postgres like test_end_of_wal does. + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result>; } +/// Wraps some WAL craft function, providing current LSN to it before the +/// insertion and flushing WAL afterwards. Also pushes initial LSN to the +/// result. fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, -) -> anyhow::Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result>, +) -> anyhow::Result> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; - let last_lsn = match last_lsn { - None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => { - let insert_lsn = client.pg_current_wal_insert_lsn()?; - match last_lsn.cmp(&insert_lsn) { - Ordering::Less => bail!( - "Some records were inserted after the crafted WAL: {} vs {}", - last_lsn, - insert_lsn - ), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - } - } - }; + let mut intermediate_lsns = f(client, initial_lsn)?; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. - client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; - match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { - Ordering::Less => bail!("Some records were flushed after the crafted WAL"), - Ordering::Equal => {} - Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. 
+ // + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; } - Ok((intermediate_lsns, last_lsn)) + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; + Ok(intermediate_lsns) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; - Ok((Vec::new(), None)) + Ok(Vec::new()) }) } } @@ -292,67 +296,85 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - // Do not use generate_internal because here we end up with flush_lsn exactly on + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); + // pg_switch_wal returns end of last record of the switched segment, + // i.e. end of SWITCH itself. + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let before_xlog_switch_u64 = u64::from(before_xlog_switch); + let next_segment = PgLsn::from( + before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64) + + WAL_SEGMENT_SIZE as u64, + ); ensure!( - after_xlog_switch <= next_segment, - "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end <= next_segment, + "XLOG_SWITCH record ended after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +/// Craft xlog SWITCH record ending at page boundary. impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. 
We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; + let after_lsn = client.pg_current_wal_insert_lsn()?; + + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. 
+ let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. + let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + break; } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, @@ -361,28 +383,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { // Emit the XLOG_SWITCH let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); let next_segment = PgLsn::from(0x0200_0000); ensure!( - after_xlog_switch < next_segment, - "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end < next_segment, + "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); ensure!( - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == 
XLOG_SIZE_OF_XLOG_SHORT_PHD, "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - after_xlog_switch, - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } -fn craft_single_logical_message( +/// Write ~16MB logical message; it should cross WAL segment. +fn craft_seg_size_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> anyhow::Result<(Vec, PgLsn)> { +) -> anyhow::Result> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -405,34 +428,24 @@ fn craft_single_logical_message( "Logical message crossed two segments" ); - if transactional { - // Transactional logical messages are part of a transaction, so the one above is - // followed by a small COMMIT record. - - let after_message_lsn = client.pg_current_wal_insert_lsn()?; - ensure!( - message_lsn < after_message_lsn, - "No record found after the emitted message" - ); - Ok((vec![message_lsn], Some(after_message_lsn))) - } else { - Ok((Vec::new(), Some(message_lsn))) - } + Ok(vec![message_lsn]) }) } pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, true) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Transactional message crossing WAL segment will be followed by small + // commit record. 
+ craft_seg_size_logical_message(client, true) } } pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, false) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + craft_seg_size_logical_message(client, false) } } diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 6ff4c563b2..496458b2e4 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -11,13 +11,15 @@ use utils::const_assert; use utils::lsn::Lsn; fn init_logging() { - let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( - format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), - )) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!( + "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace" + ))) .is_test(true) .try_init(); } +/// Test that find_end_of_wal returns the same results as pg_dump on various +/// WALs created by Crafter. fn test_end_of_wal(test_name: &str) { use crate::*; @@ -38,13 +40,13 @@ fn test_end_of_wal(test_name: &str) { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let (intermediate_lsns, expected_end_of_wal_partial) = - C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); let intermediate_lsns: Vec = intermediate_lsns .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); - let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + // Kill postgres. Note that it might have inserted to WAL something after + // 'craft' did its job. 
srv.kill(); // Check find_end_of_wal on the initial WAL @@ -56,7 +58,7 @@ fn test_end_of_wal(test_name: &str) { .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment); for start_lsn in intermediate_lsns .iter() .chain(std::iter::once(&expected_end_of_wal)) @@ -91,11 +93,7 @@ fn test_end_of_wal(test_name: &str) { } } -fn check_pg_waldump_end_of_wal( - cfg: &crate::Conf, - last_segment: &str, - expected_end_of_wal: Lsn, -) { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) @@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal( } }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - info!( - "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_end_of_wal - ); - assert_eq!(waldump_wal_end, expected_end_of_wal); + info!("waldump erred on {}", waldump_wal_end); + waldump_wal_end } fn check_end_of_wal( @@ -210,9 +205,9 @@ pub fn test_update_next_xid() { #[test] pub fn test_encode_logical_message() { let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, - 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, + 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, ]; let actual = encode_logical_message("prefix", "message"); assert_eq!(expected, actual[..]); diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4a53f485ca..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ 
b/libs/remote_storage/Cargo.toml @@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5fff3e25c9..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; @@ -128,12 +130,12 @@ impl AzureBlobStorage { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 let download = async { let response = builder @@ -152,39 +154,46 @@ impl AzureBlobStorage { Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut response = std::pin::pin!(response); + let mut response = Box::pin(response); - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part?; - if etag.is_none() { - etag = Some(part.blob.properties.etag); - } - if last_modified.is_none() { - last_modified = 
Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - - if bufs.is_empty() { + let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( - "Azure GET response contained no buffers" + "Azure GET response contained no response body" ))); + }; + let part = part?; + if etag.is_none() { + etag = Some(part.blob.properties.etag); } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part + .data + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), @@ -193,7 +202,10 @@ impl AzureBlobStorage { tokio::select! 
{ bufs = download => bufs, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + }, } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ab2035f19a..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -53,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. /// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -134,6 +136,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -157,47 +164,21 @@ pub struct Listing { /// providing basic CRUD operations for storage files. 
#[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter, None, cancel) - .await? - .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. + /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. 
Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. /// - /// max_keys limits max number of keys returned; None means unlimited. - async fn list_files( - &self, - prefix: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) - .await? - .keys; - Ok(result) - } - async fn list( &self, prefix: Option<&RemotePath>, @@ -336,41 +317,6 @@ impl GenericRemoteStorage> { } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - // - // max_keys limits max number of keys returned; None means unlimited. - pub async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, - Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, - Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, - Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, - Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, - Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, - } - } - /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -565,6 +511,16 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); +impl From<[(&str, &str); 
N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { @@ -609,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -737,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 313d8226b1..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,11 +5,9 @@ //! volume is mounted to the local FS. 
use std::{ - borrow::Cow, - future::Future, + collections::HashSet, io::ErrorKind, num::NonZeroU32, - pin::Pin, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -22,11 +20,11 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; use crate::{ Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -93,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? { + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? 
.into_iter() .map(|path| { @@ -120,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? if initial_dir.parent().is_none() { @@ -198,6 +244,7 @@ impl LocalFs { fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&temp_file_path) .await .with_context(|| { @@ -294,61 +341,66 @@ impl RemoteStorage for LocalFs { let op = async { let mut result = Listing::default(); - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) + // Filter out directories: in S3 directories don't exist, only the keys within them do. + let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? 
- { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. + let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. + let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); } + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } Ok(result) }; @@ -559,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - 
Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? { - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -922,13 +930,18 @@ mod fs_tests { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep let listing = storage @@ -941,7 +954,25 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & 
prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + [RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), @@ -950,12 +981,66 @@ mod fs_tests { &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a freeform prefix. 
+ let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1cb85cfb1b..c3d6c75e20 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -27,10 +27,10 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, @@ -74,13 +75,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -112,6 +113,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 
429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -119,41 +152,36 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] 
- } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } @@ -178,10 +206,7 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), @@ -471,16 +496,11 @@ impl RemoteStorage for S3Bucket { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); let _permit = self.permit(kind, cancel).await?; @@ -549,11 +569,15 @@ impl RemoteStorage for S3Bucket { } } - result.prefixes.extend( - prefixes - .iter() - .filter_map(|o| 
Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -586,6 +610,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -637,6 +662,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -894,6 +920,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1050,22 +1077,22 @@ mod tests { Some("/test/prefix/"), ]; let expected_outputs = [ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; @@ -1077,6 +1104,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git 
a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f5344d3ae2..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -107,27 +107,6 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix, cancel).await - } - - async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys, cancel).await - } - async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 72f6f956e0..673151c8ef 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; @@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None, &cancel) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? 
+ .prefixes .into_iter() .collect::>(); assert_eq!( @@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix), &cancel) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .prefixes .into_iter() .collect::>(); let remote_only_prefixes = nested_remote_prefixes @@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None, &cancel) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? 
+ .keys .into_iter() .collect::>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads." ); // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) .await .context("client list root files failure")?; - assert_eq!(limited_root_files.len(), 2); + assert_eq!(limited_root_files.keys.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None, &cancel) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() .collect::>(); let trim_remote_blobs: HashSet<_> = ctx @@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirrectory mismatches with the uploads." ); Ok(()) } @@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None, &cancel).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? 
+ .prefixes; assert_eq!(prefixes.len(), 1); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6adddf52a9..cd0b2be4b5 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -57,7 +57,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -86,7 +85,6 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -134,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, @@ -148,7 +142,6 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index bc5e40e70f..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None, cancel)) - .await - .context("list root files failure")? - .into_iter() - .collect::>()) + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? 
+ .keys + .into_iter() + .collect::>(), + ) } let cancel = CancellationToken::new(); @@ -219,7 +222,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -248,7 +250,6 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -296,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. -// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, @@ -310,7 +307,6 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -384,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs index 7660d41c56..0ffea0f2cd 100644 --- a/libs/tenant_size_model/tests/tests.rs +++ b/libs/tenant_size_model/tests/tests.rs @@ -247,7 +247,7 @@ fn scenario_4() { // // This is in total 5000 + 1000 + 5000 + 1000 = 12000 // - // (If we used the the method from the previous scenario, and + // (If we used the method from the previous scenario, and // kept only snapshot at the branch point, we'd need to keep // all the WAL between 10000-18000 on the main branch, so // the total size 
would be 5000 + 1000 + 8000 = 14000. The diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index c2d9d9d396..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,6 +22,7 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index af15cee924..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; @@ -89,6 +90,10 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + +pub mod 
poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 987b9d9ad2..59c66ca757 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -63,6 +63,7 @@ impl UnwrittenLockFile { pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT + .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..27378c69fc --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. 
+ pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. +pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index b7301776eb..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ 
b/libs/utils/src/seqwait.rs @@ -2,11 +2,10 @@ use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. + status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. 
+ fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. @@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -182,9 +241,21 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. 
- fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -193,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -219,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. - let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -243,6 +302,23 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. 
+ +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8eee1f72a6..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -192,6 +192,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this 
OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> { /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() + .expect("guard is not created unless value has been initialized") + } +} + +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) } } @@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> { /// On drop, this type will return the permit. 
pub struct InitPermit(Arc); +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( @@ -559,4 +582,22 @@ mod tests { assert_eq!(*target.get().unwrap(), 11); } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index ba37966476..ca02637ecf 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -69,7 +69,7 @@ pub struct Config { /// should be removed once we have a better solution there. sys_buffer_bytes: u64, - /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in /// other words, providing a ceiling for the highest value of the threshold by enforcing that /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the /// threshold. 
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 906302e46e..bbc3663402 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -50,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { } } +extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).update_donor(&mut (*donor), donor_lsn) + } +} + extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; @@ -391,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api { get_shmem_state: Some(get_shmem_state), start_streaming: Some(start_streaming), get_flush_rec_ptr: Some(get_flush_rec_ptr), + update_donor: Some(update_donor), get_current_timestamp: Some(get_current_timestamp), conn_error_message: Some(conn_error_message), conn_status: Some(conn_status), @@ -421,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api { } } +pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { + let empty_feedback = crate::bindings::PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + + crate::bindings::WalproposerShmemState { + propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, + donor_name: [0; 64], + donor_conninfo: [0; 1024], + donor_lsn: 0, + mutex: 0, + mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 }, + backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 }, + currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, + } +} + impl std::fmt::Display for Level { fn fmt(&self, f: &mut 
std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}", self) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 14cc3e05a2..fb815607a7 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,8 +1,5 @@ use std::ffi::CString; -use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; - use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ @@ -10,6 +7,8 @@ use crate::{ WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; +use postgres_ffi::WAL_SEGMENT_SIZE; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. @@ -28,6 +27,10 @@ pub trait ApiImpl { todo!() } + fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) { + todo!() + } + fn get_current_timestamp(&self) -> i64 { todo!() } @@ -274,6 +277,7 @@ mod tests { sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; + use std::cell::UnsafeCell; use utils::id::TenantTimelineId; use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; @@ -297,6 +301,8 @@ mod tests { replies_ptr: AtomicUsize, // channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, + // Shmem state, used for storing donor info + shmem: UnsafeCell, } impl MockImpl { @@ -327,11 +333,22 @@ mod tests { } impl ApiImpl for MockImpl { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { + self.shmem.get() + } + fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } + fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + shmem.donor_lsn = donor_lsn; + } + fn conn_status( &self, _: &mut 
crate::bindings::Safekeeper, @@ -507,6 +524,7 @@ mod tests { ], replies_ptr: AtomicUsize::new(0), sync_channel: sender, + shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f304294591..4335f38f1e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -59,6 +59,7 @@ signal-hook.workspace = true smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true sync_wrapper.workspace = true +sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } @@ -69,6 +70,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1d02aa7709 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); diff --git 
a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 3efad546a6..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-03-20 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] -//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] -//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] -//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] -//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] -//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] -//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] -//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] -//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] -//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] -//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] -//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] -//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] -//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] -//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] -//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! 
async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! 
``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = 
Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. 
+ let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ab55d2b0a3..6df8b2170d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -128,12 +128,12 @@ impl Client { pub async fn timeline_info( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); @@ -151,11 +151,11 @@ impl Client { pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) @@ -243,6 +243,19 @@ impl Client { Ok(()) } + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + 
"{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; @@ -271,6 +284,34 @@ impl Client { Ok((status, progress)) } + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) + } + pub async fn location_config( &self, tenant_shard_id: TenantShardId, @@ -278,10 +319,7 @@ impl Client { flush_ms: Option, lazy: bool, ) -> Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: Some(tenant_shard_id), - config, - }; + let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 49175b3b90..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) 
.await?; let Client { cancel_on_client_drop, diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 60fc7ac925..20e9cf2196 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -18,12 +18,15 @@ //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::interface::*; use utils::lsn::Lsn; @@ -43,7 +46,8 @@ pub async fn compact_tiered( fanout: u64, ctx: &E::RequestContext, ) -> anyhow::Result<()> { - assert!(fanout >= 2); + assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}"); + let exp_base = fanout.max(2); // Start at L0 let mut current_level_no = 0; let mut current_level_target_height = target_file_size; @@ -102,11 +106,17 @@ pub async fn compact_tiered( ctx, ) .await?; - if target_file_size == u64::MAX { + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); break; } current_level_no += 1; - 
current_level_target_height = current_level_target_height.saturating_mul(fanout); + current_level_target_height = current_level_target_height.saturating_mul(exp_base); } Ok(()) } @@ -124,6 +134,7 @@ async fn compact_level( } let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), target_file_size, _lsn_range: lsn_range.clone(), layers: layer_fragments, @@ -163,6 +174,8 @@ struct LevelCompactionState<'a, E> where E: CompactionJobExecutor, { + shard_identity: ShardIdentity, + // parameters target_file_size: u64, @@ -365,6 +378,7 @@ where .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, + &self.shard_identity, ) * 8192; let wal_size = job @@ -429,7 +443,7 @@ where keyspace, self.target_file_size / 8192, ); - while let Some(key_range) = window.choose_next_image() { + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), @@ -529,7 +543,10 @@ where } } // Open stream - let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? + .map(Result::<_, anyhow::Error>::Ok)); let mut new_jobs = Vec::new(); // Slide a window through the keyspace @@ -622,7 +639,12 @@ impl KeyspaceWindowPos { } // Advance the cursor until it reaches 'target_keysize'. - fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { @@ -631,7 +653,7 @@ impl KeyspaceWindowPos { } // We're now within 'curr_range'. Can we advance past it completely? 
- let distance = K::key_range_size(&(self.end_key..curr_range.end)); + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; @@ -640,7 +662,7 @@ impl KeyspaceWindowPos { } else { // advance within the range let skip_key = self.end_key.skip_some(); - let distance = K::key_range_size(&(self.end_key..skip_key)); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; @@ -676,7 +698,7 @@ where } } - fn choose_next_image(&mut self) -> Option> { + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; @@ -686,6 +708,7 @@ where next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to @@ -694,6 +717,7 @@ where end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..06454ee1d0 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -5,19 +5,28 @@ use crate::interface::*; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; +use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use 
std::task::{ready, Poll}; +use utils::lsn::Lsn; -pub fn keyspace_total_size(keyspace: &CompactionKeySpace) -> u64 +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 where K: CompactionKey, { - keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { @@ -101,17 +110,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( } } +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. + let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { - fn key(&self) -> E::Key { + fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } } impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { fn partial_cmp(&self, other: &Self) -> Option { @@ -121,12 +153,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering 
{ // reverse order so that we get a min-heap - other.key().cmp(&self.key()) + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { fn eq(&self, other: &Self) -> bool { - self.key().eq(&other.key()) + self.cmp(other) == std::cmp::Ordering::Equal } } impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} @@ -180,7 +212,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { @@ -207,7 +239,7 @@ pub struct KeySize { pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> where - K: Eq, + K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { @@ -222,12 +254,15 @@ where num_values: 1, size: first.size(), }; + let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { accum.size += this.size(); accum.num_values += 1; } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; yield accum; accum = KeySize { key: this.key(), @@ -236,6 +271,7 @@ where }; } } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..35519b5d0a 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,9 +3,8 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. 
-use async_trait::async_trait; use futures::Future; -use pageserver_api::{key::Key, keyspace::key_range_size}; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; use utils::lsn::Lsn; @@ -33,6 +32,8 @@ pub trait CompactionJobExecutor { // Functions that the planner uses to support its decisions // ---- + fn get_shard_identity(&self) -> &ShardIdentity; + /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, @@ -99,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. - fn key_range_size(key_range: &Range) -> u32; + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; @@ -114,8 +115,8 @@ impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; - fn key_range_size(r: &std::ops::Range) -> u32 { - key_range_size(r) + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() @@ -141,18 +142,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. 
- async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..3543df64fa 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,8 +2,8 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; @@ -72,7 +72,7 @@ impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; - fn key_range_size(key_range: &Range) -> u32 { + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } @@ -139,7 +139,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; @@ -436,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline { type ImageLayer = Arc; type RequestContext = MockRequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 1cea2a20e1..7aa20e6863 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -1,5 +1,20 @@ +use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + 
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one @@ -11,13 +26,14 @@ use pageserver_compaction::simulator::MockTimeline; #[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { + setup_logging(); let mut executor = MockTimeline::new(); - executor.target_file_size = 10_000_000; // 10 MB + executor.target_file_size = 1_000_000; // 1 MB - // Ingest 100 MB of updates to a single key. + // Ingest 10 MB of updates to a single key. for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); - executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } @@ -33,3 +49,26 @@ async fn test_many_updates_for_single_key() { } } } + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." 
} +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..d8082f8ab4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,19 +9,48 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! 
## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! -use anyhow::Result; +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! + +use anyhow::{Context, Result}; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; @@ -63,12 +92,65 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on 
{lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); @@ -76,20 +158,32 @@ pub fn main() -> Result<()> { // Don't try and parse "metadata" like a key-lsn range continue; } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. + } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -103,11 +197,19 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -123,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -143,7 
+244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -151,8 +252,29 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..0d010eb009 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- 
a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. 
+ /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? 
+ } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). 
+fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + 
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 55844be041..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; @@ -95,7 +96,7 @@ async fn main_impl( let timeline = *timeline; let info = mgmt_api_client .timeline_info( - timeline.tenant_id, + TenantShardId::unsharded(timeline.tenant_id), timeline.timeline_id, ForceAwaitLogicalSize::No, ) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 2838511a77..5043a207fc 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -173,7 +174,10 @@ async fn main_impl( let timeline = *timeline; async move { let partitioning = mgmt_api_client - 
.keyspace(timeline.tenant_id, timeline.timeline_id) + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) .await?; let lsn = partitioning.at_lsn; let start = Instant::now(); @@ -308,8 +312,12 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 98938d780a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; @@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); @@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..a26ed84a0d --- /dev/null 
+++ b/pageserver/src/aux_file.rs @@ -0,0 +1,208 @@ +use bytes::{Buf, BufMut, Bytes}; +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. 
+pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +const AUX_FILE_ENCODING_VERSION: u8 = 0x01; + +pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { + let mut ptr = val; + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = &ptr[..key_len]; + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = &ptr[..val_len]; + ptr.advance(val_len); + + let path = std::str::from_utf8(key)?; + files.push((path, content)); + } + Ok(files) +} + +/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference +/// to the original value slice. Be cautious about memory consumption. 
+pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result> { + let mut ptr = val.clone(); + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = ptr.slice(..key_len); + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = ptr.slice(..val_len); + ptr.advance(val_len); + + let path = std::str::from_utf8(&key)?.to_string(); + files.push((path, content)); + } + Ok(files) +} + +pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { + if files.is_empty() { + // no files = empty value + return Ok(Vec::new()); + } + let mut encoded = vec![]; + encoded.put_u8(AUX_FILE_ENCODING_VERSION); + for (path, content) in files { + if path.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds path size limit", path); + } + encoded.put_u32(path.len() as u32); + encoded.put_slice(path.as_bytes()); + if content.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds content size limit", path); + } + encoded.put_u32(content.len() as u32); + encoded.put_slice(content); + } + Ok(encoded) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "6200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "620000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "620000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "62000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "6200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "620000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } + + #[test] + fn test_value_encoding() { + let files = vec![ + ("pg_logical/1.file", "1111".as_bytes()), + ("pg_logical/2.file", "2222".as_bytes()), + ]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + let files = vec![]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..dca1510810 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{key_to_slru_block, Key}; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. /// @@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -92,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -159,15 +169,26 @@ where } } - async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key_to_slru_block(*key)?; match kind { SlruKind::Clog => { - ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { - ensure!(block.len() == BLCKSZ as usize); + if 
block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } } } @@ -194,12 +215,15 @@ where Ok(()) } - async fn flush(&mut self) -> anyhow::Result<()> { + async fn flush(&mut self) -> Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{}/{:>04X}", kind.to_str(), segno); let header = new_tar_header(&segname, self.buf.len() as u64)?; - self.ar.append(&header, self.buf.as_slice()).await?; + self.ar + .append(&header, self.buf.as_slice()) + .await + .map_err(BasebackupError::Client)?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -209,7 +233,7 @@ where Ok(()) } - async fn finish(mut self) -> anyhow::Result<()> { + async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { @@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -262,16 +286,25 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? - .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
+ .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; @@ -279,8 +312,11 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -289,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -312,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -343,34 +385,41 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
{ self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. - async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -384,14 +433,18 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -411,20 +464,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), 
BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -437,14 +492,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -463,18 +524,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -484,11 +553,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -497,11 +572,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -509,7 +585,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -518,24 +597,28 @@ where // Add generated pg_control file and bootstrap 
WAL segment. // Also send zenith.signal file with extra bootstrap data. // - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + if self.timeline.is_ancestor_lsn(self.lsn) { + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -557,7 +640,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -572,8 +658,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs 
b/pageserver/src/bin/pageserver.rs index 1fd7c775d5..eb4b8bb8bb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,6 +3,7 @@ //! Main entry point for the Page Server executable. use std::env::{var, VarError}; +use std::io::Read; use std::sync::Arc; use std::time::Duration; use std::{env, ops::ControlFlow, str::FromStr}; @@ -18,6 +19,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -120,8 +122,10 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, log the effective IO engine choice + // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -148,37 +152,34 @@ fn initialize_config( workdir: &Utf8Path, ) -> anyhow::Result> { let init = arg_matches.get_flag("init"); - let update_config = init || arg_matches.get_flag("update-config"); - let (mut toml, config_file_exists) = if cfg_file_path.is_file() { - if init { - anyhow::bail!( - "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it", - ); + let file_contents: Option = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { + if init { + anyhow::bail!("config file already exists: {cfg_file_path}"); + } + let md = f.metadata().context("stat config file")?; + if md.is_file() { + let mut s = String::new(); + f.read_to_string(&mut s).context("read config file")?; + Some(s.parse().context("parse config file 
toml")?) + } else { + anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, + Err(e) => { + anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(cfg_file_path) - .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?; - ( - cfg_file_contents - .parse::() - .with_context(|| { - format!("Failed to parse '{cfg_file_path}' as pageserver config") - })?, - true, - ) - } else if cfg_file_path.exists() { - anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file"); - } else { - // We're initializing the tenant, so there's no config file yet - ( - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")?, - false, - ) }; + let mut effective_config = file_contents.unwrap_or_else(|| { + DEFAULT_CONFIG_FILE + .parse() + .expect("unit tests ensure this works") + }); + + // Patch with overrides from the command line if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -186,22 +187,21 @@ fn initialize_config( })?; for (key, item) in doc.iter() { - if config_file_exists && update_config && key == "id" && toml.contains_key(key) { - anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden"); - } - toml.insert(key, item.clone()); + effective_config.insert(key, item.clone()); } } } - debug!("Resulting toml: {toml}"); - let conf = PageServerConf::parse_and_validate(&toml, workdir) + debug!("Resulting toml: {effective_config}"); + + // Construct the runtime representation + let conf = PageServerConf::parse_and_validate(&effective_config, workdir) .context("Failed to parse pageserver configuration")?; - if update_config { + if init { 
info!("Writing pageserver config to '{cfg_file_path}'"); - std::fs::write(cfg_file_path, toml.to_string()) + std::fs::write(cfg_file_path, effective_config.to_string()) .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; info!("Config successfully written to '{cfg_file_path}'") } @@ -284,6 +284,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -600,32 +601,37 @@ fn start_pageserver( None, "consumption metrics collection", true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + { + let tenant_manager = tenant_manager.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) + pageserver::consumption_metrics::collect_metrics( + tenant_manager, + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + local_disk_storage, + cancel, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + } }, ); } @@ -666,42 +672,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
- shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } @@ -754,18 +755,13 @@ fn cli() -> Command { // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") + .long("config-override") .short('c') .num_args(1) .action(ArgAction::Append) .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
\ Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) - .arg( - Arg::new("update-config") - .long("update-config") - .action(ArgAction::SetTrue) - .help("Update the config file when started"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ad9ade4a9..258eed0b12 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; -use std::{collections::HashMap, env}; +use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -30,9 +30,9 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -51,7 +51,7 @@ pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; @@ -91,10 +91,16 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. 
/// @@ -134,10 +140,14 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#get_impl = '{DEFAULT_GET_IMPL}' + #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -156,6 +166,8 @@ pub mod defaults { #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} +#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} + [remote_storage] "# @@ -234,6 +246,7 @@ pub struct PageServerConf { // How often to send unchanged cached metrics to the metrics endpoint. pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, @@ -275,9 +288,20 @@ pub struct PageServerConf { pub get_vectored_impl: GetVectoredImpl, + pub get_impl: GetImpl, + pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, + + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this + /// is exceeded, we start proactively closing ephemeral layers to limit the total amount + /// of ephemeral data. + /// + /// Setting this to zero disables limits on total ephemeral layer size. + pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -311,26 +335,6 @@ impl BuilderValue { } } -// Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. 
This information is not neeed by the pageserver -// itself, it is only used for registering the pageserver with the control -// plane and/or storage controller. -// -#[derive(serde::Deserialize)] -pub(crate) struct NodeMetadata { - #[serde(rename = "host")] - pub(crate) postgres_host: String, - #[serde(rename = "port")] - pub(crate) postgres_port: u16, - pub(crate) http_host: String, - pub(crate) http_port: u16, - - // Deployment tools may write fields to the metadata file beyond what we - // use in this type: this type intentionally only names fields that require. - #[serde(flatten)] - pub(crate) other: HashMap, -} - // needed to simplify config construction #[derive(Default)] struct PageServerConfigBuilder { @@ -373,6 +377,7 @@ struct PageServerConfigBuilder { cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + metric_collection_bucket: BuilderValue>, disk_usage_based_eviction: BuilderValue>, @@ -395,9 +400,15 @@ struct PageServerConfigBuilder { get_vectored_impl: BuilderValue, + get_impl: BuilderValue, + max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, + + ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -455,6 +466,8 @@ impl PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + metric_collection_bucket: Set(None), + disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -478,10 +491,14 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), 
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), + ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -586,6 +603,13 @@ impl PageServerConfigBuilder { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } + pub fn metric_collection_bucket( + &mut self, + metric_collection_bucket: Option, + ) { + self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) + } + pub fn synthetic_size_calculation_interval( &mut self, synthetic_size_calculation_interval: Duration, @@ -646,6 +670,10 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } @@ -654,6 +682,14 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { + self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); + } + + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -694,6 +730,7 @@ impl PageServerConfigBuilder { metric_collection_interval, cached_metric_collection_interval, metric_collection_endpoint, + metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, @@ -706,8 +743,11 @@ impl PageServerConfigBuilder { secondary_download_concurrency, ingest_batch_size, get_vectored_impl, + get_impl, max_vectored_read_bytes, validate_vectored_get, + ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -942,6 +982,9 @@ impl PageServerConf { let 
endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); }, + "metric_collection_bucket" => { + builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), @@ -986,6 +1029,9 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( @@ -995,6 +1041,12 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "ephemeral_bytes_per_memory_kb" => { + builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) + } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1057,6 +1109,7 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -1070,11 +1123,14 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1289,6 +1345,7 @@ background_task_maximum_delay = '334 s' defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, @@ -1306,11 +1363,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), validate_vectored_get: 
defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1363,6 +1423,7 @@ background_task_maximum_delay = '334 s' metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -1376,11 +1437,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); @@ -1493,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c7f9d596c6..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,10 +3,13 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, 
TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; +use crate::tenant::{ + mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant, +}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; +use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use std::collections::HashMap; use std::sync::Arc; @@ -40,7 +43,9 @@ type Cache = HashMap; /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] pub async fn collect_metrics( + tenant_manager: Arc, metric_collection_endpoint: &Url, + metric_collection_bucket: &Option, metric_collection_interval: Duration, _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, @@ -65,15 +70,19 @@ pub async fn collect_metrics( None, "synthetic size calculation", false, - async move { - calculate_synthetic_size_worker( - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) + { + let tenant_manager = tenant_manager.clone(); + async move { + calculate_synthetic_size_worker( + tenant_manager, + synthetic_size_calculation_interval, + &cancel, + &worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")) + .await?; + Ok(()) + } }, ); @@ -94,13 +103,27 @@ pub async fn collect_metrics( .build() .expect("Failed to create http client with timeout"); + let bucket_client = if let Some(bucket_config) = metric_collection_bucket { + match GenericRemoteStorage::from_config(bucket_config) { + Ok(client) => Some(client), + Err(e) => { + // Non-fatal error: if we were given an invalid config, we will proceed + // with sending metrics over the network, but not to S3. 
+ tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); + None + } + } + } else { + None + }; + let node_id = node_id.to_string(); loop { let started_at = Instant::now(); // these are point in time, with variable "now" - let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await; + let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; let metrics = Arc::new(metrics); @@ -118,10 +141,18 @@ pub async fn collect_metrics( tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + + if let Some(bucket_client) = &bucket_client { + let res = + upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + if let Err(e) = res { + tracing::error!("failed to upload to S3: {e:#}"); + } + } }; let upload = async { - let res = upload::upload_metrics( + let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, @@ -132,7 +163,7 @@ pub async fn collect_metrics( .await; if let Err(e) = res { // serialization error which should never happen - tracing::error!("failed to upload due to {e:#}"); + tracing::error!("failed to upload via HTTP due to {e:#}"); } }; @@ -247,6 +278,7 @@ async fn reschedule( /// Caclculate synthetic size for each active tenant async fn calculate_synthetic_size_worker( + tenant_manager: Arc, synthetic_size_calculation_interval: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -259,7 +291,7 @@ async fn calculate_synthetic_size_worker( loop { let started_at = Instant::now(); - let tenants = match mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(e) => { warn!("cannot get tenant list: {e:#}"); @@ -272,16 +304,20 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; } - let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else { continue; }; + if !tenant.is_active() { + continue; + } + // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. @@ -319,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re }; // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. + // mean the synthetic size worker should terminate. let shutting_down = matches!( e.downcast_ref::(), Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 26b299a71d..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,3 +1,4 @@ +use crate::tenant::mgr::TenantManager; use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; @@ -181,6 +182,7 @@ impl MetricsKey { } pub(super) async fn collect_all_metrics( + tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, ) -> Vec { @@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics( let started_at = std::time::Instant::now(); - let tenants = match crate::tenant::mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(err) => { tracing::error!("failed to list tenants: {:?}", err); @@ -197,10 +199,11 @@ pub(super) async fn collect_all_metrics( }; let 
tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { - crate::tenant::mgr::get_tenant(id, true) + tenant_manager + .get_attached_tenant_shard(id) .ok() .map(|tenant| (id.tenant_id, tenant)) } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 6b840a3136..4e8283c3e4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,4 +1,9 @@ +use std::time::SystemTime; + +use chrono::{DateTime, Utc}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -13,8 +18,9 @@ struct Ids { pub(super) timeline_id: Option, } +/// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] -pub(super) async fn upload_metrics( +pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, @@ -74,6 +80,60 @@ pub(super) async fn upload_metrics( Ok(()) } +/// Serialize and write metrics to a remote storage object +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics_bucket( + client: &GenericRemoteStorage, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], +) -> anyhow::Result<()> { + if metrics.is_empty() { + // Skip uploads if we have no metrics, so that readers don't have to handle the edge case + // of an empty object. 
+ return Ok(()); + } + + // Compose object path + let datetime: DateTime = SystemTime::now().into(); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; + + // Set up a gzip writer into a buffer + let mut compressed_bytes: Vec = Vec::new(); + let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); + let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); + + // Serialize and write into compressed buffer + let started_at = std::time::Instant::now(); + for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + let (_chunk, body) = res?; + gzip_writer.write_all(&body).await?; + } + gzip_writer.flush().await?; + gzip_writer.shutdown().await?; + let compressed_length = compressed_bytes.len(); + + // Write to remote storage + client + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), + compressed_length, + &path, + cancel, + ) + .await?; + let elapsed = started_at.elapsed(); + + tracing::info!( + compressed_length, + elapsed_ms = elapsed.as_millis(), + "write metrics bucket at {path}", + ); + + Ok(()) +} + // The return type is quite ugly, but we gain testability in isolation fn serialize_in_chunks<'a, F>( chunk_size: usize, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 42c800822b..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -12,12 +12,10 @@ use pageserver_api::{ use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, generation::Generation, id::NodeId}; +use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::{ - config::{NodeMetadata, PageServerConf}, - virtual_file::on_fatal_io_error, -}; +use crate::{config::PageServerConf, 
virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -65,7 +63,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", jwt.get_contents()).parse().unwrap(), @@ -210,7 +208,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e3c11cb299..c937309d83 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -38,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: 
TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -734,20 +734,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. - pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -797,7 +797,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -952,7 +952,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = 
ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. - pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92c1475aef..ebeb8bbb20 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -61,11 +61,10 @@ use crate::{ metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - self, mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -541,7 +540,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( js.spawn(async move { layer .secondary_tenant - .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name) + 
.evict_layer( + tenant_manager.get_conf(), + layer.timeline_id, + layer.name, + layer.metadata, + ) .await; Ok(file_size) }); @@ -600,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -633,9 +637,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } @@ -814,8 +818,8 @@ async fn collect_eviction_candidates( const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); // get a snapshot of the list of tenants - let tenants = tenant::mgr::list_tenants() - .await + let tenants = tenant_manager + .list_tenants() .context("get list of tenants")?; // TODO: avoid listing every layer in every tenant: this loop can block the executor, @@ -827,8 +831,12 @@ async fn collect_eviction_candidates( if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } - let tenant = match tenant::mgr::get_tenant(tenant_id, true) { - Ok(tenant) => tenant, + let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) { + Ok(tenant) if tenant.is_active() => tenant, + Ok(_) => { + debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active"); + continue; + } Err(e) => { // this can happen if tenant has lifecycle transition after we fetched it debug!("failed to get tenant: {e:#}"); diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0771229845..36c74ed140 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": 
description: The reload completed successfully. - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. 
content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,81 +418,8 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: | - Calculate tenant's synthetic size - responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id in: path @@ -894,19 +449,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and it's timelines. "401": description: Unauthorized Error content: @@ -945,18 +490,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +520,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1038,29 +557,11 @@ paths: format: hex responses: "201": - description: TimelineInfo + description: Timeline was created, or already existed with matching parameters content: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1068,23 +569,17 @@ paths: schema: $ref: "#/components/schemas/Error" "409": - description: Timeline already exists, creation skipped + description: Timeline already exists, with different parameters. Creation cannot proceed. content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error + "429": + description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry. content: application/json: schema: $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1098,30 +593,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1142,43 +613,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1200,36 +640,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1249,42 +659,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1298,12 +672,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: @@ -1385,9 +753,6 @@ components: required: - mode properties: - tenant_id: - type: string - description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1535,6 +900,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: @@ -1623,7 +991,7 @@ components: type: integer format: int64 minimum: 0 - description: The amount of disk space currently utilized by layer files. + description: The amount of disk space currently used. 
free_space_bytes: type: integer format: int64 diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26f23fb8c2..a8ca642dc5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,8 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; @@ -29,6 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; @@ -49,14 +52,18 @@ use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, - TenantSlotError, TenantSlotUpsertError, TenantStateError, + GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, + TenantSlotUpsertError, TenantStateError, }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerName; use 
crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -160,6 +167,9 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) + } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } @@ -249,16 +259,11 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - GetTenantError::Broken(reason) => { - ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) - } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. - // - // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
ApiError::ResourceUnavailable("Tenant not yet active".into()) } GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()), @@ -269,6 +274,9 @@ impl From for ApiError { impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { + GetActiveTenantError::Broken(reason) => { + ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) + } GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), @@ -279,19 +287,6 @@ impl From for ApiError { } } -impl From for ApiError { - fn from(e: SetNewTenantConfigError) -> ApiError { - match e { - SetNewTenantConfigError::GetTenant(tid) => { - ApiError::NotFound(anyhow!("tenant {}", tid).into()) - } - e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => { - ApiError::InternalServerError(anyhow::Error::new(e)) - } - } - } -} - impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -472,8 +467,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -495,7 +494,7 @@ async fn timeline_create_handler( async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -535,10 +534,13 @@ async fn timeline_create_handler( HttpErrorBody::from_msg("Tenant shutting down".to_string()), ) } - Err( - e @ tenant::CreateTimelineError::Conflict - | e @ 
tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), + Err(e @ tenant::CreateTimelineError::Conflict) => { + json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) + } + Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( + StatusCode::TOO_MANY_REQUESTS, + HttpErrorBody::from_msg(e.to_string()), + ), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), @@ -581,7 +583,7 @@ async fn timeline_list_handler( let response_data = async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -619,6 +621,7 @@ async fn timeline_preserve_initdb_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); // Part of the process for disaster recovery from safekeeper-stored WAL: // If we don't recover into a new timeline but want to keep the timeline ID, @@ -626,7 +629,9 @@ async fn timeline_preserve_initdb_handler( // location where timeline recreation cand find it. 
async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -668,7 +673,7 @@ async fn timeline_detail_handler( let timeline_info = async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -705,7 +710,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -756,7 +761,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -781,7 +786,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } @@ -855,7 +862,7 @@ async fn timeline_delete_handler( let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false) + .get_attached_tenant_shard(tenant_shard_id) .map_err(|e| { match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't @@ -973,10 +980,11 @@ async fn tenant_list_handler( _cancel: CancellationToken, ) -> Result, ApiError> { 
check_permission(&request, None)?; + let state = get_state(&request); - let response_data = mgr::list_tenants() - .instrument(info_span!("tenant_list")) - .await + let response_data = state + .tenant_manager + .list_tenants() .map_err(|_| { ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? @@ -999,9 +1007,27 @@ async fn tenant_status( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. + let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); let tenant_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. 
+ tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -1074,16 +1100,20 @@ async fn tenant_size_handler( let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); + let state = get_state(&request); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_shard_id, true)?; - - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // this can be long operation let inputs = tenant .gather_size_inputs( @@ -1152,10 +1182,15 @@ async fn tenant_shard_split_handler( let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let new_shards = state .tenant_manager .shard_split( - tenant_shard_id, + tenant, ShardCount::new(req.new_shard_count), req.new_stripe_size, &ctx, @@ -1194,13 +1229,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); let timeline = 
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1224,11 +1261,14 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1373,8 +1413,11 @@ async fn get_tenant_config_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let response = HashMap::from([ ( @@ -1402,15 +1445,31 @@ async fn update_tenant_config_handler( let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let tenant_conf = + let new_tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - state + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state .tenant_manager - .set_new_tenant_config(tenant_conf, tenant_id) - .instrument(info_span!("tenant_config", %tenant_id)) - .await?; + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the 
caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(ApiError::InternalServerError)?; + tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) } @@ -1634,10 +1693,12 @@ async fn handle_tenant_break( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) - .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - - tenant.set_broken("broken from test".to_owned()).await; + let state = get_state(&r); + state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)? + .set_broken("broken from test".to_owned()) + .await; json_response(StatusCode::OK, ()) } @@ -1772,6 +1833,75 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn timeline_detach_ancestor_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + use crate::tenant::timeline::detach_ancestor::Options; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + + async move { + let mut options = Options::default(); + + let rewrite_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; + let copy_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?; + + [ + (&mut options.rewrite_concurrency, rewrite_concurrency), + 
(&mut options.copy_concurrency, copy_concurrency), + ] + .into_iter() + .filter_map(|(target, val)| val.map(|val| (target, val))) + .for_each(|(target, val)| *target = val); + + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); + let ctx = &ctx; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(|e| ApiError::NotFound(e.into()))?; + + let (_guard, prepared) = timeline + .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + let res = state + .tenant_manager + .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) + .await; + + match res { + Ok(reparented_timelines) => { + let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + reparented_timelines, + }; + + json_response(StatusCode::OK, resp) + } + Err(e) => Err(ApiError::InternalServerError( + e.context("timeline detach completion"), + )), + } + } + .instrument(span) + .await +} + async fn deletion_queue_flush( r: Request, cancel: CancellationToken, @@ -1863,12 +1993,14 @@ async fn timeline_collect_keyspace( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. 
+ let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } @@ -1881,7 +2013,7 @@ async fn active_timeline_of_active_tenant( tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -1986,6 +2118,79 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let Some(remote_storage) = state.remote_storage.as_ref() else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Remote storage not configured" + ))); + }; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + 
index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, @@ -2030,6 +2235,27 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -2364,6 +2590,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) + .put( + 
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", + |r| api_handler(r, timeline_detach_ancestor_handler), + ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) @@ -2382,12 +2612,18 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 343dec2ca1..ed409d3130 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; +use pageserver_api::key::rel_block_to_key; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; @@ -170,7 +171,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use 
pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 075bb76a1b..256f2f334c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -86,41 +89,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); // Metrics collected on operations on the storage repository. 
+#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -133,13 +153,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), 
) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { @@ -154,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -164,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) 
static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -187,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -435,7 +545,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", - "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + "The size of the layer files present in the remote storage that are listed in the remote index_part.json.", // Corollary: If any files are missing from the index part, they won't be included here. &["tenant_id", "shard_id", "timeline_id"] ) @@ -699,6 +809,14 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); +pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_timeline_ephemeral_bytes", + "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated." 
+ ) + .expect("Failed to register metric") +}); + /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things /// like how long it took to load. /// @@ -1394,29 +1512,80 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy }); pub(crate) struct TenantManagerMetrics { - pub(crate) tenant_slots: UIntGauge, + tenant_slots_attached: UIntGauge, + tenant_slots_secondary: UIntGauge, + tenant_slots_inprogress: UIntGauge, pub(crate) tenant_slot_writes: IntCounter, pub(crate) unexpected_errors: IntCounter, } +impl TenantManagerMetrics { + /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects + /// exactly: they track the lifetime of the slots _in the tenant map_. + pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.inc(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.inc(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.inc(); + } + } + } + + pub(crate) fn slot_removed(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.dec(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.dec(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.dec(); + } + } + } + + #[cfg(all(debug_assertions, not(test)))] + pub(crate) fn slots_total(&self) -> u64 { + self.tenant_slots_attached.get() + + self.tenant_slots_secondary.get() + + self.tenant_slots_inprogress.get() + } +} + pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { - TenantManagerMetrics { - tenant_slots: register_uint_gauge!( + let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", + &["mode"] ) - .expect("failed to define a metric"), - tenant_slot_writes: register_int_counter!( - "pageserver_tenant_manager_slot_writes", - 
"Writes to a tenant slot, including all of create/attach/detach/delete" - ) - .expect("failed to define a metric"), - unexpected_errors: register_int_counter!( - "pageserver_tenant_manager_unexpected_errors_total", - "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." - ) - .expect("failed to define a metric"), -} + .expect("failed to define a metric"); + TenantManagerMetrics { + tenant_slots_attached: tenant_slots + .get_metric_with_label_values(&["attached"]) + .unwrap(), + tenant_slots_secondary: tenant_slots + .get_metric_with_label_values(&["secondary"]) + .unwrap(), + tenant_slots_inprogress: tenant_slots + .get_metric_with_label_values(&["inprogress"]) + .unwrap(), + tenant_slot_writes: register_int_counter!( + "pageserver_tenant_manager_slot_writes", + "Writes to a tenant slot, including all of create/attach/detach/delete" + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_tenant_manager_unexpected_errors_total", + "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." 
+ ) + .expect("failed to define a metric"), + } }); pub(crate) struct DeletionQueueMetrics { @@ -1474,29 +1643,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1504,7 +1650,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1522,7 +1669,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: 
register_int_counter!( @@ -1530,6 +1677,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1696,6 +1844,43 @@ macro_rules! redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, + pub(crate) time_spent_on_ingest: Histogram, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), + time_spent_on_ingest: register_histogram!( + "pageserver_wal_ingest_put_value_seconds", + "Actual time spent on ingesting a record", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -1805,6 +1990,29 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + 
std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -1826,6 +2034,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. + pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1886,6 +2110,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -1946,6 +2171,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN 
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -1988,6 +2219,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, @@ -2075,7 +2307,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } @@ -2086,6 +2318,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2093,6 +2326,7 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. struct PerTimelineRemotePhysicalSizeGauge { @@ -2655,6 +2889,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. 
+ The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // @@ -2675,6 +2929,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_CANDIDATES_REMOVED, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, ] .into_iter() .for_each(|c| { @@ -2711,7 +2967,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f3ceb7d3e6..f6b251283c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,13 +1,5 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. 
-// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -23,7 +15,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; @@ -56,6 +48,7 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; @@ -551,6 +544,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -613,14 +607,15 @@ impl PageServerHandler { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. 
// Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +624,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -639,7 +634,7 @@ impl PageServerHandler { } PagestreamFeMessage::GetPage(req) => { // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -648,7 +643,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -657,7 +652,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + let span = 
tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -760,6 +755,7 @@ impl PageServerHandler { let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); timeline .import_basebackup_from_tar( + tenant.clone(), &mut copyin_reader, base_lsn, self.broker_client.clone(), @@ -837,66 +833,80 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. 
If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. 
- if lsn <= last_record_lsn { - lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline.wait_lsn(lsn, ctx).await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } - timeline.wait_lsn(lsn, ctx).await?; + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); } - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. 
That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - Ok(lsn) } #[instrument(skip_all, fields(shard_id))] @@ -913,12 +923,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -941,12 +956,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -969,18 +989,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + 
timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -1147,12 +1166,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -1175,9 +1199,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let kind = SlruKind::from_repr(req.kind) .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; @@ -1188,6 +1217,10 @@ impl PageServerHandler { )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. 
#[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1204,6 +1237,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } + let started = std::time::Instant::now(); // check that the timeline exists @@ -1214,7 +1254,13 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1223,7 +1269,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. 
Compress if not @@ -1238,7 +1285,8 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1259,9 +1307,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1271,11 +1323,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started @@ -1385,7 +1439,34 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { + if query_string.starts_with("pagestream_v2 ") { + let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); + let params = params_raw.split(' ').collect::>(); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if query_string.starts_with("pagestream 
") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); if params.len() != 2 { @@ -1404,8 +1485,14 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f7d74bdee..a4215ee107 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,9 +9,10 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; +use crate::metrics::WAL_INGEST; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; +use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; @@ -22,6 +23,8 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -175,7 +178,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -184,7 +186,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = 
self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -206,7 +208,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -214,7 +215,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -225,7 +226,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -239,7 +239,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -252,16 +252,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. 
- self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -270,7 +262,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -289,7 +280,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -389,7 +380,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -466,6 +457,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. + // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. 
@@ -674,7 +671,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub(crate) async fn list_aux_files( + async fn list_aux_files_v1( &self, lsn: Lsn, ctx: &RequestContext, @@ -692,6 +689,63 @@ impl Timeline { } } + async fn list_aux_files_v2( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + for (_, v) in kv { + let v = v.context("get value")?; + let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + for (fname, content) in v { + result.insert(fname, content); + } + } + Ok(result) + } + + pub(crate) async fn list_aux_files( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + match self.get_switch_aux_file_policy() { + AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, + AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, + AuxFilePolicy::CrossValidation => { + let v1_result = self.list_aux_files_v1(lsn, ctx).await; + let v2_result = self.list_aux_files_v2(lsn, ctx).await; + match (v1_result, v2_result) { + (Ok(v1), Ok(v2)) => { + if v1 != v2 { + tracing::error!( + "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" + ); + return Err(PageReconstructError::Other(anyhow::anyhow!( + "unmatched aux file v1 v2 result" + ))); + } + Ok(v1) + } + (Ok(_), Err(v2)) => { + tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); + Err(v2) + } + (Err(v1), Ok(_)) => { + tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); + Err(v1) + } + (Err(_), Err(v2)) => Err(v2), + } + } + } + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -735,11 +789,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. 
/// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). + /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -811,13 +867,18 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + )) } /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return Some(*nblocks); } @@ -828,7 +889,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. 
+ return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -844,13 +914,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } @@ -1088,7 +1158,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1131,21 +1201,22 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. 
Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { @@ -1187,7 +1258,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? 
{ let size_key = rel_size_to_key(rel); @@ -1376,6 +1447,9 @@ impl<'a> DatadirModification<'a> { } pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { + return Ok(()); + } let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; @@ -1391,85 +1465,121 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value - dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; + let policy = self.tline.get_switch_aux_file_policy(); + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), + }; + let files = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? 
} else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. - return Err(e.into()); - } - // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so - // we are assuming that all _other_ possible errors represents a missing key. If some - // other error occurs, we may incorrectly reset the map of aux files. - Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. 
+ Vec::new() + }; + let new_files = if content.is_empty() { + files + .into_iter() + .filter(|(p, _)| &path != p) + .collect::>() + } else { + files + .into_iter() + .filter(|(p, _)| &path != p) + .chain(std::iter::once((path, content))) + .collect::>() + }; + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); + } - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); + if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { + let file_path = path.to_string(); + let content = if content.is_empty() { + None + } else { + Some(Bytes::copy_from_slice(content)) + }; + + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { + // We already updated aux files in `self`: emit a delta and update our latest value. + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { self.put( AUX_FILES_KEY, Value::Image(Bytes::from( AuxFilesDirectory::ser(&dir).context("serialize")?, )), ); - n_files = 1; - aux_files.dir = Some(dir); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::AncestorStopping(_) + | PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important 
that we do not interpret a shutdown error as "not found" and thereby + // reset the map. + return Err(e.into()); + } + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); + } } } - } - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + } Ok(()) } @@ -1541,6 +1651,8 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; + let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1580,6 +1692,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + timer.observe_duration(); + Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9959d105eb..7b30c3ecf7 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,6 +33,49 @@ impl Value { } } +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. 
Before implementing that, lets +/// use this type for querying if a slice looks some particular way. +pub(crate) struct ValueBytes; + +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) + return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; @@ -70,6 +113,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -93,6 +138,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte 
length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..5f46ce3d69 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. 
// -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. 
- std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. 
+ set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! 
pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); @@ -214,13 +285,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. 
/// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -230,7 +300,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager @@ -250,6 +319,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) + IngestHousekeeping, + /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, @@ -292,8 +364,14 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + + LayerDownload, + #[cfg(test)] UnitTest, + + DetachAncestor, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fe48741a89..010e56a899 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,6 +12,7 @@ //! 
use anyhow::{bail, Context}; +use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; @@ -63,6 +64,7 @@ use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; +use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -85,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -98,7 +99,7 @@ use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::span; @@ -260,7 +261,7 @@ pub struct Tenant { // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. - tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -321,6 +322,9 @@ pub struct Tenant { /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. pub(crate) timeline_get_throttle: Arc>, + + /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. 
+ ongoing_timeline_detach: std::sync::Mutex>, } impl std::fmt::Debug for Tenant { @@ -385,7 +389,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -558,9 +562,10 @@ impl Tenant { // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. let rtc = timeline.remote_client.as_ref().unwrap(); rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -886,7 +891,7 @@ impl Tenant { #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { @@ -916,9 +921,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } @@ -1411,7 +1420,7 @@ impl Tenant { /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. #[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, @@ -1515,7 +1524,7 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. 
ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { @@ -1559,7 +1568,7 @@ impl Tenant { })?; } - loaded_timeline.activate(broker_client, None, ctx); + loaded_timeline.activate(self.clone(), broker_client, None, ctx); Ok(loaded_timeline) } @@ -1606,7 +1615,7 @@ impl Tenant { ); { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); @@ -1633,7 +1642,7 @@ impl Tenant { } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(()); @@ -1670,6 +1679,34 @@ impl Tenant { Ok(()) } + // Call through to all timelines to freeze ephemeral layers if needed. Usually + // this happens during ingest: this background housekeeping is for freezing layers + // that are open but haven't been written to for some time. + async fn ingest_housekeeping(&self) { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. 
+ let timelines = { + self.timelines + .lock() + .unwrap() + .values() + .filter_map(|timeline| { + if timeline.is_active() { + Some(timeline.clone()) + } else { + None + } + }) + .collect::>() + }; + + for timeline in &timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -1731,7 +1768,12 @@ impl Tenant { let mut activated_timelines = 0; for timeline in timelines_to_activate { - timeline.activate(broker_client.clone(), background_jobs_can_start, ctx); + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + ctx, + ); activated_timelines += 1; } @@ -1777,7 +1819,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -1824,16 +1866,8 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let timeline_id = timeline.timeline_id; - - let span = - tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -2063,7 +2097,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken { .. } | TenantState::Stopping { .. } => { + TenantState::Broken { reason, .. } => { + // This is fatal, and reported distinctly from the general case of "will never be active" because + // it's logically a 500 to external API users (broken is always a bug). 
+ return Err(GetActiveTenantError::Broken(reason)); + } + TenantState::Stopping { .. } => { // There's no chance the tenant can transition back into ::Active return Err(GetActiveTenantError::WillNotBecomeActive(current_state)); } @@ -2072,14 +2111,14 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf.read().unwrap().location.attach_mode + self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); let location_config_mode = match conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, @@ -2141,7 +2180,7 @@ impl Tenant { // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. - tl_client.shutdown().await?; + tl_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. 
In principal the RemoteTimelineClient could provide this without downloading it, but this @@ -2226,7 +2265,7 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { @@ -2235,84 +2274,84 @@ impl Tenant { } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf 
.gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2324,26 +2363,40 @@ impl Tenant { } pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; - self.tenant_conf_updated(); + // Use 
read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; - self.tenant_conf_updated(); + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } @@ -2357,11 +2410,8 @@ impl Tenant { .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) } - pub(crate) fn tenant_conf_updated(&self) { - let conf = { - let guard = self.tenant_conf.read().unwrap(); - Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) - }; + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); self.timeline_get_throttle.reconfigure(conf) } @@ -2509,7 +2559,8 @@ impl Tenant { Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), &crate::metrics::tenant_throttling::TIMELINE_GET, )), - tenant_conf: Arc::new(RwLock::new(attached_conf)), + tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), + ongoing_timeline_detach: std::sync::Mutex::default(), } } @@ -2793,7 +2844,48 @@ impl Tenant { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; + + match res { + Ok(cutoffs) => { + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + Err(e) => { + tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); + } + } + } + + if !self.is_active() { + anyhow::bail!("shutting down"); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. 
For each timeline, remember the timeline ID and @@ -2855,20 +2947,36 @@ impl Tenant { } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); - gc_timelines.push(timeline); + { + let mut target = timeline.gc_info.write().unwrap(); + + match gc_cutoffs.remove(&timeline_id) { + Some(cutoffs) => { + *target = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. 
+ target.retain_lsns = branchpoints; + } + }; } + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) @@ -2955,7 +3063,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -3013,7 +3121,7 @@ impl Tenant { // See also https://github.com/neondatabase/neon/issues/3865 if let Some(remote_client) = new_timeline.remote_client.as_ref() { remote_client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; } @@ -3176,7 +3284,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3384,7 +3492,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => anyhow::bail!("cancelled"), + _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + }; size::gather_inputs( self, @@ -3423,7 +3535,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) @@ -3495,7 +3607,7 @@ impl Tenant { } pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } } @@ -3643,6 +3755,10 @@ pub(crate) mod harness { heatmap_period: Some(tenant_conf.heatmap_period), lazy_slru_download: Some(tenant_conf.lazy_slru_download), timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), + image_layer_creation_check_threshold: Some( + tenant_conf.image_layer_creation_check_threshold, + ), + switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), } } } @@ -3831,6 +3947,8 @@ pub(crate) mod harness { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; @@ -3839,8 +3957,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; + use tests::storage_layer::ValuesReconstructState; + use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4286,7 +4408,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call 
harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -4327,7 +4449,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -4477,11 +4599,25 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: usize, + compact: bool, ) -> anyhow::Result<()> { let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4489,6 +4625,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4510,22 +4648,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + if compact { + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + } + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. 
+ let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) .await?; - timeline.gc().await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4544,7 +4679,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4575,7 +4710,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -4628,7 +4763,9 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); - let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; + let vectored_res = tline + .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) .await; @@ -4637,6 +4774,59 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) 
+ .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + assert!(images.is_empty()); + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. // @@ -4765,7 +4955,12 @@ mod tests { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let results = child_timeline - .get_vectored_impl(read.clone(), current_lsn, &ctx) + .get_vectored_impl( + read.clone(), + current_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await?; for (key, img_res) in results { @@ -4776,15 +4971,192 @@ mod tests { Ok(()) } + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advacing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. 
+ // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, 
current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -4843,22 +5215,11 @@ mod tests { ); } - // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; + // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -4879,6 +5240,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -4942,21 +5305,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5108,7 +5461,7 @@ mod tests { // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() + .shutdown(super::timeline::ShutdownMode::Hard) .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); @@ -5138,19 +5491,140 @@ mod tests { #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_read_at_max_lsn")?; + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + let compact = false; + 
bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); - assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? + { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + tline.freeze_and_flush().await?; + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } Ok(()) } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0d33100ead..1dc451f5c9 100644 --- a/pageserver/src/tenant/blob_io.rs 
+++ b/pageserver/src/tenant/blob_io.rs @@ -121,7 +121,7 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. - pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) 
= self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. 
- pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 53a8c97e23..a743ce3c16 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,6 +9,7 @@ //! may lead to a data loss. //! use anyhow::bail; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -57,6 +58,9 @@ pub mod defaults { // throughputs up to 1GiB/s per timeline. pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. 
+ pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } @@ -362,6 +366,14 @@ pub struct TenantConf { pub lazy_slru_download: bool, pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expressed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + pub switch_aux_file_policy: AuxFilePolicy, } /// Same as TenantConf, but this struct preserves the information about @@ -454,6 +466,13 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_aux_file_policy: Option, } impl TenantConfOpt { @@ -508,6 +527,12 @@ impl TenantConfOpt { .timeline_get_throttle .clone() .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_aux_file_policy: self + .switch_aux_file_policy + .unwrap_or(global_conf.switch_aux_file_policy), } } } @@ -548,6 +573,8 @@ impl Default for TenantConf { heatmap_period: Duration::ZERO, lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: AuxFilePolicy::V1, } } } @@ -621,6 +648,8 @@ impl From for models::TenantConfig { heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: 
value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_aux_file_policy: value.switch_aux_file_policy, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index cab60c3111..2e5259bfe2 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -14,7 +14,10 @@ use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, + tenant::{ + mgr::{TenantSlot, TenantsMapRemoveResult}, + timeline::ShutdownMode, + }, }; use super::{ @@ -111,6 +114,7 @@ async fn create_local_delete_mark( let _ = std::fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&marker_path) .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; @@ -432,6 +436,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { @@ -462,7 +471,7 @@ impl DeleteTenantFlow { // tenant.shutdown // Its also bad that we're holding tenants.read here. // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, false).await.is_err() { + if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { return Err(DeleteTenantError::Other(anyhow::anyhow!( "tenant shutdown is already in progress" ))); @@ -576,9 +585,20 @@ impl DeleteTenantFlow { // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); + + // Update stats + match &removed { + TenantsMapRemoveResult::Occupied(slot) => { + crate::metrics::TENANT_MANAGER.slot_removed(slot); + } + TenantsMapRemoveResult::InProgress(barrier) => { + crate::metrics::TENANT_MANAGER + .slot_removed(&TenantSlot::InProgress(barrier.clone())); + } + TenantsMapRemoveResult::Vacant => { + // Nothing changed in map, no metric update + } + } match removed { TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index e48b9e83bd..8b815a1885 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,36 +3,26 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - /// - /// None <=> IO is ongoing. - /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
- mutable_tail: Option, + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, @@ -59,17 +49,18 @@ impl EphemeralFile { .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() + } + + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -77,44 +68,7 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = self - .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail( - self.mutable_tail - .as_deref() - .expect("we're not doing IO, it must be Some()") - .try_into() - .expect("we ensure that it's always PAGE_SZ"), - )) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( @@ -122,137 +76,22 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. 
- off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self - .ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not yet ongoing")[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) - .expect("IO is not yet ongoing"); - let (mutable_tail, res) = self - .ephemeral_file - .file - .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) - .await; - // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. - // I.e., the IO isn't retryable if we panic. - self.ephemeral_file.mutable_tail = Some(mutable_tail); - match res { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice( - self.ephemeral_file - .mutable_tail - .as_deref() - .expect("IO is not ongoing"), - ); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not ongoing") - .fill(0); - // This block is done, move to next one. - self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; 
+ self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } @@ -267,28 +106,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..42def8858e --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,223 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. 
+pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf, ctx).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? 
{ + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. + + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. 
+ // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. 
+ let iobuf = match self.file.write_all(buf, ctx).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..b37eafb52c --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,130 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + context::RequestContext, + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = 64 * 1024; + +/// See module-level comment. 
+pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. + // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
+ // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset:x}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! 
A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. +pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + 
new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable because boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! + N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index b8ed69052f..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -346,35 +346,6 @@ where } } -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum InMemoryLayerHandle { - Open { - lsn_floor: Lsn, - end_lsn: Lsn, - }, - Frozen { - idx: usize, - lsn_floor: Lsn, - end_lsn: Lsn, - }, -} - -impl InMemoryLayerHandle { - pub fn get_lsn_floor(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, - InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, - } - } - - pub fn get_end_lsn(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, - InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, - } - } -} - impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -576,41 +547,18 @@ impl LayerMap { self.historic.iter() } - /// Get a handle for the first in memory layer that matches the provided predicate. 
- /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. - /// - /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during - /// the same exclusive region established by holding the layer manager lock. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> where Pred: FnMut(&Arc) -> bool, { if let Some(open) = &self.open_layer { if pred(open) { - return Some(InMemoryLayerHandle::Open { - lsn_floor: open.get_lsn_range().start, - end_lsn: open.get_lsn_range().end, - }); + return Some(open.clone()); } } - let pos = self.frozen_layers.iter().rev().position(pred); - pos.map(|rev_idx| { - let idx = self.frozen_layers.len() - 1 - rev_idx; - InMemoryLayerHandle::Frozen { - idx, - lsn_floor: self.frozen_layers[idx].get_lsn_range().start, - end_lsn: self.frozen_layers[idx].get_lsn_range().end, - } - }) - } - - /// Get the layer pointed to by the provided handle. - pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { - match handle { - InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), - InMemoryLayerHandle::Frozen { idx, .. 
} => self.frozen_layers.get(*idx).cloned(), - } + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() } /// @@ -640,7 +588,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -724,12 +672,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -741,17 +689,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -968,6 +916,7 @@ mod tests { assert_eq!(lhs, rhs); } + #[cfg(test)] fn brute_force_range_search( layer_map: &LayerMap, key_range: Range, diff --git 
a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1736950d1f..fc71ea7642 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -207,6 +207,24 @@ impl TimelineMetadata { self.body.ancestor_lsn } + /// When reparenting, the `ancestor_lsn` does not change. + pub fn reparent(&mut self, timeline: &TimelineId) { + assert!(self.body.ancestor_timeline.is_some()); + // no assertion for redoing this: it's fine, we may have to repeat this multiple times over + self.body.ancestor_timeline = Some(*timeline); + } + + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { + if let Some(ancestor) = self.body.ancestor_timeline { + assert_eq!(ancestor, branchpoint.0); + } + if self.body.ancestor_lsn != Lsn(0) { + assert_eq!(self.body.ancestor_lsn, branchpoint.1); + } + self.body.ancestor_timeline = None; + self.body.ancestor_lsn = Lsn(0); + } + pub fn latest_gc_cutoff_lsn(&self) -> Lsn { self.body.latest_gc_cutoff_lsn } @@ -235,6 +253,12 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } + + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -259,6 +283,27 @@ impl Serialize for TimelineMetadata { } } +/// Parts of the metadata which are regularly modified. 
+pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, +} + +impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 97a505ded9..6be66e99ad 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,9 +2,10 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::models::{LocationConfigMode, ShardParameters}; +use pageserver_api::models::LocationConfigMode; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; @@ -16,6 +17,7 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; use std::time::{Duration, Instant}; +use sysinfo::SystemExt; use tokio::fs; use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; @@ -39,10 +41,11 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, - TenantConfOpt, }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::inmemory_layer; +use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; @@ -53,6 +56,7 @@ use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; use super::secondary::SecondaryTenant; +use 
super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -243,6 +247,7 @@ impl TenantsMap { } } + #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { TenantsMap::Initializing => 0, @@ -251,17 +256,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. -async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -284,6 +287,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -543,6 +568,18 @@ pub async fn init_tenant_mgr( let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + // Initialize dynamic limits that depend on system resources + let system_memory = + sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) + .total_memory(); + let max_ephemeral_layer_bytes = + conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); + tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( + max_ephemeral_layer_bytes, + std::sync::atomic::Ordering::Relaxed, + ); + // Scan local filesystem for attached tenants 
let tenant_configs = init_load_tenant_configs(conf).await?; @@ -556,7 +593,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -583,18 +624,22 @@ pub async fn init_tenant_mgr( const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; - // Update the location config according to the re-attach response if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; // We deleted local content: move on to next tenant, don't try and spawn this one. 
continue; @@ -640,8 +685,32 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => { @@ -664,14 +733,22 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; + METRICS.slot_inserted(&slot); tenants.insert(tenant_shard_id, slot); } @@ -679,7 +756,7 @@ pub async fn init_tenant_mgr( let mut 
tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); - METRICS.tenant_slots.set(tenants.len() as u64); + *tenants_map = TenantsMap::Open(tenants); Ok(TenantManager { @@ -750,6 +827,14 @@ fn tenant_spawn( async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); @@ -770,11 +855,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -875,16 +958,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // caller will log how long we took } -#[derive(Debug, thiserror::Error)] -pub(crate) enum SetNewTenantConfigError { - #[error(transparent)] - GetTenant(#[from] GetTenantError), - #[error(transparent)] - Persist(anyhow::Error), - #[error(transparent)] - Other(anyhow::Error), -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -910,32 +983,21 @@ impl TenantManager { self.conf } - /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. 
- /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently + /// undergoing a state change (i.e. slot is InProgress). + /// + /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or + /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it. pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, - active_only: bool, ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, + Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) @@ -1115,7 +1177,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1231,7 +1293,7 @@ impl TenantManager { TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!("Shutting down just-spawned tenant, because tenant manager is shut down"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => 
{ info!("Finished shutting down just-spawned tenant"); } @@ -1281,7 +1343,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1419,16 +1481,23 @@ impl TenantManager { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to rerturn success afterr deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the bacckground + slot_guard.revert(); + return Ok(()); + } } _ => { tenant .wait_to_become_active(activation_timeout) .await .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) => { + GetActiveTenantError::WillNotBecomeActive(_) + | GetActiveTenantError::Broken(_) => { DeleteTenantError::InvalidState(tenant.current_state()) } GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, @@ -1455,29 +1524,30 @@ impl TenantManager { result } - #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))] + #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] pub(crate) async fn shard_split( &self, - tenant_shard_id: TenantShardId, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { + let 
tenant_shard_id = *tenant.get_tenant_shard_id(); let r = self - .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx) .await; if r.is_err() { // Shard splitting might have left the original shard in a partially shut down state (it // stops the shard's remote timeline client). Reset it to ensure we leave things in // a working state. if self.get(tenant_shard_id).is_some() { - tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + tracing::warn!("Resetting after shard split failure"); if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { // Log this error because our return value will still be the original error, not this one. This is // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or // setting it broken probably won't help either. - tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + tracing::error!("Failed to reset: {e}"); } } } @@ -1487,12 +1557,12 @@ impl TenantManager { pub(crate) async fn do_shard_split( &self, - tenant_shard_id: TenantShardId, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let tenant = get_tenant(tenant_shard_id, true)?; + let tenant_shard_id = *tenant.get_tenant_shard_id(); // Validate the incoming request if new_shard_count.count() <= tenant_shard_id.shard_count.count() { @@ -1538,7 +1608,6 @@ impl TenantManager { // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might // have been left in a partially-shut-down state. 
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); - self.reset_tenant(tenant_shard_id, false, ctx).await?; return Err(e); } @@ -1656,7 +1725,14 @@ impl TenantManager { fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( "failpoint" ))); - if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. tracing::warn!( @@ -1677,7 +1753,7 @@ impl TenantManager { // Phase 5: Shut down the parent shard, and erase it from disk let (_guard, progress) = completion::channel(); - match parent.shutdown(progress, false).await { + match parent.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(other) => { other.wait().await; @@ -1687,7 +1763,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1842,28 +1918,6 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } - /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in - /// the background, and thereby avoid blocking any API requests on this deletion completing. - fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - } - pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, @@ -1880,7 +1934,7 @@ impl TenantManager { deletion_queue_client, ) .await?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); Ok(()) } @@ -1936,38 +1990,118 @@ impl TenantManager { removal_result } - pub(crate) async fn set_new_tenant_config( + pub(crate) fn list_tenants( &self, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, - ) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + ) -> Result, TenantMapListError> { + let tenants = TENANTS.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() + .filter_map(|(id, tenant)| match tenant { + TenantSlot::Attached(tenant) => { + Some((*id, tenant.current_state(), tenant.generation())) + } + TenantSlot::Secondary(_) => None, + TenantSlot::InProgress(_) => None, + }) + .collect()) + } - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; + /// Completes an earlier prepared timeline detach ancestor. + pub(crate) async fn complete_detaching_timeline_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + prepared: PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + struct RevertOnDropSlot(Option); - if !tenant.tenant_shard_id().shard_count.is_unsharded() { - // Note that we use ShardParameters::default below. 
- return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); + impl Drop for RevertOnDropSlot { + fn drop(&mut self) { + if let Some(taken) = self.0.take() { + taken.revert(); + } + } } - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf.clone(), - tenant.generation, - &ShardParameters::default(), - ); + impl RevertOnDropSlot { + fn into_inner(mut self) -> SlotGuard { + self.0.take().unwrap() + } + } - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) + impl std::ops::Deref for RevertOnDropSlot { + type Target = SlotGuard; + + fn deref(&self) -> &Self::Target { + self.0.as_ref().unwrap() + } + } + + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let slot_guard = RevertOnDropSlot(Some(slot_guard)); + + let tenant = { + let Some(old_slot) = slot_guard.get_old_value() else { + anyhow::bail!( + "Tenant not found when trying to complete detaching timeline ancestor" + ); + }; + + let Some(tenant) = old_slot.get_attached() else { + anyhow::bail!("Tenant is not in attached state"); + }; + + if !tenant.is_active() { + anyhow::bail!("Tenant is not active"); + } + + tenant.clone() + }; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let reparented = timeline + .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + .await?; + + let mut slot_guard = slot_guard.into_inner(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => 
{ + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + self.tenants, + SpawnMode::Eager, + ctx, + )?; + + slot_guard.upsert(TenantSlot::Attached(tenant))?; + + Ok(reparented) } } @@ -1980,51 +2114,12 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} is not active")] NotActive(TenantShardId), - /// Broken is logically a subset of NotActive, but a distinct error is useful as - /// NotActive is usually a retryable state for API purposes, whereas Broken - /// is a stuck error state - #[error("Tenant is broken: {0}")] - Broken(String), // Initializing or shutting down: cannot authoritatively say whether we have this tenant #[error("Tenant map is not available: {0}")] MapState(#[from] TenantMapError), } -/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. -/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -/// -/// This method is cancel-safe. 
-pub(crate) fn get_tenant( - tenant_shard_id: TenantShardId, - active_only: bool, -) -> Result, GetTenantError> { - let locked = TENANTS.read().unwrap(); - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; - - match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), - None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum GetActiveTenantError { /// We may time out either while TenantSlot is InProgress, or while the Tenant @@ -2048,6 +2143,12 @@ pub(crate) enum GetActiveTenantError { /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken) #[error("will not become active. Current state: {0}")] WillNotBecomeActive(TenantState), + + /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as + /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should + /// never happen. + #[error("Tenant is broken: {0}")] + Broken(String), } /// Get a [`Tenant`] in its active state. 
If the tenant_id is currently in [`TenantSlot::InProgress`] @@ -2267,27 +2368,6 @@ pub(crate) enum TenantMapListError { Initializing, } -/// -/// Get list of tenants, for the mgmt API -/// -pub(crate) async fn list_tenants( -) -> Result, TenantMapListError> { - let tenants = TENANTS.read().unwrap(); - let m = match &*tenants { - TenantsMap::Initializing => return Err(TenantMapListError::Initializing), - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, - }; - Ok(m.iter() - .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => { - Some((*id, tenant.current_state(), tenant.generation())) - } - TenantSlot::Secondary(_) => None, - TenantSlot::InProgress(_) => None, - }) - .collect()) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapInsertError { #[error(transparent)] @@ -2454,10 +2534,13 @@ impl SlotGuard { TenantsMap::Open(m) => m, }; + METRICS.slot_inserted(&new_value); + let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; - - METRICS.tenant_slots.set(m.len() as u64); + if let Some(replaced) = replaced.as_ref() { + METRICS.slot_removed(replaced); + } replaced }; @@ -2567,9 +2650,13 @@ impl Drop for SlotGuard { } if self.old_value_is_shutdown() { + METRICS.slot_removed(entry.get()); entry.remove(); } else { - entry.insert(self.old_value.take().unwrap()); + let inserting = self.old_value.take().unwrap(); + METRICS.slot_inserted(&inserting); + let replaced = entry.insert(inserting); + METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { @@ -2580,8 +2667,6 @@ impl Drop for SlotGuard { ); } } - - METRICS.tenant_slots.set(m.len() as u64); } } @@ -2661,7 +2746,9 @@ fn tenant_map_acquire_slot_impl( } _ => { let (completion, barrier) = utils::completion::channel(); - v.insert(TenantSlot::InProgress(barrier)); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); 
Ok(SlotGuard::new(*tenant_shard_id, None, completion)) } @@ -2697,7 +2784,10 @@ fn tenant_map_acquire_slot_impl( _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); - let old_value = o.insert(TenantSlot::InProgress(barrier)); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, @@ -2733,11 +2823,11 @@ where let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 40be2ca8f3..9103760388 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -200,14 +200,18 @@ use utils::backoff::{ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use 
utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -217,7 +221,7 @@ use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::Delete; +use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -235,11 +239,14 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::metadata::MetadataUpdate; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // Occasional network issues and such can cause remote operations to fail, and @@ -261,20 +268,15 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. +const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), } -/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. 
-#[derive(Debug, thiserror::Error)] -pub enum StopError { - /// Returned if the upload queue was never initialized. - /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. - #[error("queue is not initialized")] - QueueUninitialized, -} - #[derive(Debug, thiserror::Error)] pub enum PersistIndexPartWithDeletedFlagError { #[error("another task is already setting the deleted_flag, started at {0:?}")] @@ -399,15 +401,10 @@ impl RemoteTimelineClient { "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; - { - let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; - self.update_remote_physical_size_gauge(Some(index_part)); - } - // also locks upload queue, without dropping the guard above it will be a deadlock - self.stop().expect("initialized line above"); - let mut upload_queue = self.upload_queue.lock().unwrap(); + upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); + self.stop_impl(&mut upload_queue); upload_queue .stopped_mut() @@ -421,7 +418,8 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), - UploadQueue::Stopped(q) => q + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q .upload_queue_for_deletion .get_last_remote_consistent_lsn_projected(), } @@ -431,13 +429,27 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), - UploadQueue::Stopped(q) => Some( + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + 
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some( q.upload_queue_for_deletion .get_last_remote_consistent_lsn_visible(), ), } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + // technically this is a dirty read, but given how timeline detach ancestor is implemented + // via tenant restart, the lineage has always been uploaded. + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -476,7 +488,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -504,9 +516,10 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -524,6 +537,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, @@ -543,9 +557,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. 
/// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -557,7 +572,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -568,7 +583,28 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.latest_metadata = metadata.clone(); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); + + Ok(()) + } + + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. 
+ pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.apply(update); + + self.schedule_index_upload(upload_queue); Ok(()) } @@ -588,32 +624,24 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - metadata: TimelineMetadata, - ) { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + info!( - "scheduling metadata upload with {} files ({} changed)", + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", upload_queue.latest_files.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); + let index_part = IndexPart::from(&*upload_queue); + let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -622,9 +650,67 @@ impl RemoteTimelineClient { self.launch_queued_tasks(upload_queue); } + pub(crate) async fn schedule_reparenting_and_wait( + self: &Arc, + new_parent: &TimelineId, 
+ ) -> anyhow::Result<()> { + // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing + // and reads the in-memory part we cannot do the detaching like this + let receiver = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + + upload_queue.latest_metadata.reparent(new_parent); + upload_queue.latest_lineage.record_previous_ancestor(&prev); + + self.schedule_index_upload(upload_queue); + + self.schedule_barrier0(upload_queue) + }; + + Self::wait_completion0(receiver).await + } + + /// Schedules uploading a new version of `index_part.json` with the given layers added, + /// detaching from ancestor and waits for it to complete. /// - /// Launch an upload operation in the background. - /// + /// This is used with `Timeline::detach_ancestor` functionality. + pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait( + self: &Arc, + layers: &[Layer], + adopted: (TimelineId, Lsn), + ) -> anyhow::Result<()> { + let barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.detach_from_ancestor(&adopted); + upload_queue.latest_lineage.record_detaching(&adopted); + + for layer in layers { + upload_queue + .latest_files + .insert(layer.layer_desc().layer_name(), layer.metadata()); + } + + self.schedule_index_upload(upload_queue); + + let barrier = self.schedule_barrier0(upload_queue); + self.launch_queued_tasks(upload_queue); + barrier + }; + + Self::wait_completion0(barrier).await + } + + /// Launch an upload operation in the background; the file is added to be included in next + /// `index_part.json` upload. 
pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, @@ -646,13 +732,15 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( - "scheduled layer file upload {layer} gen={:?} shard={:?}", - metadata.generation, metadata.shard + gen=?metadata.generation, + shard=?metadata.shard, + "scheduled layer file upload {layer}", ); + let op = UploadOp::UploadLayer(layer, metadata); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); @@ -668,7 +756,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -696,7 +784,7 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); @@ -711,14 +799,10 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Vec<(LayerName, LayerFileMetadata)> where - I: IntoIterator, + I: IntoIterator, { - // Deleting layers doesn't affect the values stored in TimelineMetadata, - // so we don't need update it. Just serialize it. - let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. 
@@ -757,7 +841,7 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, metadata); + self.schedule_index_upload(upload_queue); } with_metadata @@ -767,7 +851,7 @@ impl RemoteTimelineClient { /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -780,7 +864,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. 
These are // layers that originate from some ancestor shard after a split, and may still @@ -849,7 +933,7 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -859,12 +943,18 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled uploads/deletions to complete pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let mut receiver = { + let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_barrier0(upload_queue) }; + Self::wait_completion0(receiver).await + } + + async fn wait_completion0( + mut receiver: tokio::sync::watch::Receiver<()>, + ) -> anyhow::Result<()> { if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); } @@ -898,7 +988,7 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled operations to complete, and then stop. /// /// Not cancellation safe - pub(crate) async fn shutdown(self: &Arc) -> Result<(), StopError> { + pub(crate) async fn shutdown(self: &Arc) { // On cancellation the queue is left in ackward state of refusing new operations but // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. 
@@ -909,8 +999,12 @@ impl RemoteTimelineClient { let fut = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = match &mut *guard { - UploadQueue::Stopped(_) => return Ok(()), - UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized), + UploadQueue::Stopped(_) => return, + UploadQueue::Uninitialized => { + // transition into Stopped state + self.stop_impl(&mut guard); + return; + } UploadQueue::Initialized(ref mut init) => init, }; @@ -942,7 +1036,7 @@ impl RemoteTimelineClient { } } - self.stop() + self.stop(); } /// Set the deleted_at field in the remote index file. @@ -976,8 +1070,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) - .context("IndexPart serialize")?; + let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1058,6 +1151,113 @@ impl RemoteTimelineClient { Ok(()) } + /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. + /// + /// This is not normally needed. 
+ pub(crate) async fn upload_layer_file( + self: &Arc, + uploaded: &ResidentLayer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().layer_name(), + uploaded.metadata().generation, + ); + + backoff::retry( + || async { + upload::upload_timeline_layer( + &self.storage_impl, + uploaded.local_path(), + &remote_path, + uploaded.metadata().file_size(), + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "upload a layer without adding it to latest files", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("upload a layer without adding it to latest files") + } + + /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is + /// not added to be part of a future `index_part.json` upload. 
+ pub(crate) async fn copy_timeline_layer( + self: &Arc, + adopted: &Layer, + adopted_as: &Layer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let source_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &adopted + .get_timeline_id() + .expect("Source timeline should be alive"), + self.tenant_shard_id.to_index(), + &adopted.layer_desc().layer_name(), + adopted.metadata().generation, + ); + + let target_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &adopted_as.layer_desc().layer_name(), + adopted_as.metadata().generation, + ); + + backoff::retry( + || async { + upload::copy_timeline_layer( + &self.storage_impl, + &source_remote_path, + &target_remote_path, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "copy timeline layer", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remote copy timeline layer") + } + + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { + match tokio::time::timeout( + DELETION_QUEUE_FLUSH_TIMEOUT, + self.deletion_queue_client.flush_immediate(), + ) + .await + { + Ok(result) => result, + Err(_timeout) => { + // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and + // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion + // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. + tracing::warn!( + "Timed out waiting for deletion queue flush, acking deletion anyway" + ); + Ok(()) + } + } + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. 
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. @@ -1105,23 +1305,29 @@ impl RemoteTimelineClient { // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; let cancel = shutdown_token(); let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None, &cancel) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, "list remaining files", &cancel, ) .await - .context("list files remaining files")?; + .context("list files remaining files")? + .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1181,7 +1387,7 @@ impl RemoteTimelineClient { // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( @@ -1203,7 +1409,7 @@ impl RemoteTimelineClient { while let Some(next_op) = upload_queue.queued_operations.front() { // Can we run this task now? let can_run_now = match next_op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(..) => { // Can always be scheduled. 
true } @@ -1324,24 +1530,31 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); - match self.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") - } - } + self.stop(); return; } let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. + assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().layer_name(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size(), &self.cancel, ) .measure_remote_op( @@ -1582,19 +1795,25 @@ impl RemoteTimelineClient { /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. /// /// In-progress operations will still be running after this function returns. - /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. - pub(crate) fn stop(&self) -> Result<(), StopError> { + pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. 
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. let mut guard = self.upload_queue.lock().unwrap(); - match &mut *guard { - UploadQueue::Uninitialized => Err(StopError::QueueUninitialized), + self.stop_impl(&mut guard); + } + + fn stop_impl(&self, guard: &mut std::sync::MutexGuard) { + match &mut **guard { + UploadQueue::Uninitialized => { + info!("UploadQueue is in state Uninitialized, nothing to do"); + **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized); + } UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); - Ok(()) } UploadQueue::Initialized(initialized) => { info!("shutting down upload queue"); @@ -1611,6 +1830,7 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), + latest_lineage: initialized.latest_lineage.clone(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn @@ -1627,11 +1847,13 @@ impl RemoteTimelineClient { }; let upload_queue = std::mem::replace( - &mut *guard, - UploadQueue::Stopped(UploadQueueStopped { - upload_queue_for_deletion, - deleted_at: SetDeletedFlagProgress::NotRunning, - }), + &mut **guard, + UploadQueue::Stopped(UploadQueueStopped::Deletable( + UploadQueueStoppedDeletable { + upload_queue_for_deletion, + deleted_at: SetDeletedFlagProgress::NotRunning, + }, + )), ); if let UploadQueue::Initialized(qi) = upload_queue { qi @@ -1660,15 +1882,16 @@ impl RemoteTimelineClient { // which is exactly what we want to happen. drop(op); } - - // We're done. 
- drop(guard); - Ok(()) } } } } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") @@ -1693,14 +1916,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -1761,29 +1984,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. 
-pub fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1791,6 +1991,7 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, + storage_layer::layer::local_layer_path, Tenant, Timeline, }, DEFAULT_PG_VERSION, @@ -1819,8 +2020,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -1946,7 +2147,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -1972,12 +2173,21 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + .map(|(name, contents): (LayerName, Vec)| { + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2008,7 
+2218,7 @@ mod tests { // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2044,9 +2254,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2060,7 +2270,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2078,9 +2288,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2093,9 +2303,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2114,19 +2324,22 @@ mod tests { .. 
} = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); @@ -2195,12 +2408,7 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed - let example_metadata = TimelineMetadata::example(); - let example_index_part = IndexPart::new( - HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - ); + let example_index_part = IndexPart::example(); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6ee8ad7155..b464437422 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,6 +5,7 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; 
@@ -17,21 +18,23 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -39,19 +42,27 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. 
+#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = timeline_path.join(layer_file_name.file_name()); + let local_path = local_layer_path( + conf, + &tenant_shard_id, + &timeline_id, + layer_file_name, + &layer_metadata.generation, + ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -74,7 +85,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -132,6 +143,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -182,6 +194,7 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; async { let destination_file = VirtualFile::create(dst_path) .await @@ -194,10 +207,10 @@ async fn download_object<'a>( // There's chunks_vectored() on the stream. 
let (bytes_amount, destination_file) = async { let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::< - { super::BUFFER_SIZE }, - _, - >::new(size_tracking); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -206,10 +219,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; @@ -252,42 +265,31 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - anyhow::bail!("storage-sync-list-remote-timelines"); - }); - +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - storage.list( - Some(&remote_path), - ListingMode::WithDelimiter, - None, - &cancel, - ) - }, - &format!("list timelines for {tenant_shard_id}"), + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), &cancel, ) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for 
timeline_remote_storage_key in listing.prefixes { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -299,7 +301,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -308,7 +334,7 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let 
index_part_bytes = download_retry_forever( @@ -333,7 +359,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -342,13 +368,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { @@ -417,11 +443,16 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, "list index_part files", cancel, ) - .await?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0abfdeef02..b114d6aa10 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,10 +6,10 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -76,7 +76,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is @@ -85,6 +85,9 @@ pub struct IndexPart { #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, } impl IndexPart { @@ -97,22 +100,23 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + const LATEST_VERSION: usize = 5; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; pub const FILE_NAME: &'static str = "index_part.json"; - pub fn new( - layers_and_metadata: HashMap, + fn new( + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, + lineage: Lineage, ) -> Self { - // Transform LayerFileMetadata into IndexLayerMetadata let layer_metadata = layers_and_metadata - .into_iter() - .map(|(k, v)| (k, IndexLayerMetadata::from(v))) + .iter() + .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v))) .collect(); Self { @@ -121,6 +125,7 @@ impl IndexPart { disk_consistent_lsn, metadata, deleted_at: None, + lineage, } } @@ -141,20 +146,26 @@ impl IndexPart { pub fn to_s3_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } + + #[cfg(test)] + pub(crate) fn example() -> Self { + let example_metadata = TimelineMetadata::example(); + Self::new( + &HashMap::new(), + example_metadata.disk_consistent_lsn(), + example_metadata, + Default::default(), + ) + } } -impl TryFrom<&UploadQueueInitialized> for IndexPart { - type Error = SerializeError; +impl From<&UploadQueueInitialized> for IndexPart { + fn from(uq: &UploadQueueInitialized) -> Self { + let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); + let metadata = uq.latest_metadata.clone(); + let lineage = uq.latest_lineage.clone(); - fn try_from(upload_queue: &UploadQueueInitialized) -> Result { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let metadata = upload_queue.latest_metadata.clone(); - - Ok(Self::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - )) + Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) } } @@ -172,8 +183,8 @@ pub struct IndexLayerMetadata { pub shard: ShardIndex, } -impl From for IndexLayerMetadata { - fn from(other: LayerFileMetadata) -> Self { +impl From<&LayerFileMetadata> for IndexLayerMetadata { + fn 
from(other: &LayerFileMetadata) -> Self { IndexLayerMetadata { file_size: other.file_size, generation: other.generation, @@ -182,8 +193,76 @@ impl From for IndexLayerMetadata { } } +/// Limited history of earlier ancestors. +/// +/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly +/// reparented by having an later timeline be detached from it's ancestor. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub(crate) struct Lineage { + /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`]. + #[serde(skip_serializing_if = "is_false", default)] + reparenting_history_truncated: bool, + + /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] + /// + /// These are stored in case we want to support WAL based DR on the timeline. There can be many + /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings + /// after [`Self::original_ancestor`] has been set. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + reparenting_history: Vec, + + /// The ancestor from which this timeline has been detached from and when. + /// + /// If you are adding support for detaching from a hierarchy, consider changing the ancestry + /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, +} + +fn is_false(b: &bool) -> bool { + !b +} + +impl Lineage { + const REMEMBER_AT_MOST: usize = 100; + + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + if self.reparenting_history.last() == Some(old_ancestor) { + // do not re-record it + return; + } + + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; + + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + } + + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { + assert!(self.original_ancestor.is_none()); + + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + } + + /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed + /// to start a read/write primary at this lsn". + /// + /// Returns true if the Lsn was previously a branch point. 
+ pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .as_ref() + .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + } +} + #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -219,6 +298,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -259,6 +339,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -300,7 +381,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -345,6 +427,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -383,11 +466,58 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + "disk_consistent_lsn":"0/15A7618", + 
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..caa843316f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -12,18 
+12,13 @@ use tokio_util::sync::CancellationToken; use utils::backoff; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, - }, +use crate::tenant::remote_timeline_client::{ + index::IndexPart, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, }; -use remote_storage::{GenericRemoteStorage, TimeTravelError}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. @@ -65,11 +60,10 @@ pub(crate) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -78,8 +72,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -90,34 +83,49 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. 
Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? .len(); - let metadata_size = known_metadata.file_size(); if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size) - .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; + .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, &storage_path, None, cancel) + .upload(reader, fs_size, remote_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'")) + .with_context(|| format!("upload layer from local path '{local_path}'")) +} + +pub(super) async fn copy_timeline_layer( + storage: &GenericRemoteStorage, + source_path: &RemotePath, + target_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + fail_point!("before-copy-layer", |_| { + bail!("failpoint before-copy-layer") + }); + + pausable_failpoint!("before-copy-layer-pausable"); + + storage + .copy_object(source_path, target_path, cancel) + .await + .with_context(|| format!("copy layer {source_path} to 
{target_path}")) } /// Uploads the given `initdb` data to the remote storage. @@ -167,7 +175,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..7075044baf 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -20,8 +21,9 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, + remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName}, }; use pageserver_api::{ @@ -180,7 +182,8 @@ impl SecondaryTenant { self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, - name: LayerFileName, + name: LayerName, + metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); @@ -194,9 +197,13 @@ impl SecondaryTenant { let now = SystemTime::now(); - let path = conf - .timeline_path(&self.tenant_shard_id, &timeline_id) - .join(name.file_name()); + let local_path = local_layer_path( + conf, + &self.tenant_shard_id, + &timeline_id, + &name, + &metadata.generation, + ); let this = self.clone(); @@ -207,7 +214,7 @@ impl SecondaryTenant { // it, the secondary downloader could have seen an updated heatmap that // resulted in a layer being deleted. 
// Other local I/O errors are process-fatal: these should never happen. - let deleted = std::fs::remove_file(path); + let deleted = std::fs::remove_file(local_path); let not_found = deleted .as_ref() @@ -316,9 +323,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +341,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 40f19e3b05..2a8f83be95 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,20 +8,21 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, - is_temporary, metrics::SECONDARY_MODE, tenant::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, + ephemeral_file::is_ephemeral_file, remote_timeline_client::{ index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -30,7 +31,10 @@ use crate::{ use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, 
+ TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -44,14 +48,13 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, + id::TimelineId, serde_system_time, }; use super::{ @@ -74,12 +77,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +97,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -105,7 +111,7 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { @@ -118,10 +124,10 @@ impl OnDiskState { #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. 
- pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, } /// This state is written by the secondary downloader, it is opaque @@ -270,7 +276,7 @@ impl JobGenerator SchedulingResult { @@ -301,18 +307,16 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -367,11 +371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -591,7 +596,7 @@ impl<'a> TenantDownloader<'a> { let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, - heatmap_mtime: Some(heatmap_mtime), + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; @@ -616,12 +621,12 @@ impl<'a> TenantDownloader<'a> { let layers_in_heatmap = heatmap_timeline .layers .iter() - .map(|l| &l.name) + .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state .on_disk_layers .iter() - .map(|l| l.0) + .map(|l| (l.0, l.1.metadata.generation)) .collect::>(); let mut layer_count = layers_on_disk.len(); @@ -632,29 +637,39 @@ impl<'a> TenantDownloader<'a> { .sum(); // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { layer_count -= 1; layer_byte_count -= timeline_state .on_disk_layers - .get(layer) + 
.get(layer_file_name) .unwrap() .metadata .file_size(); - delete_layers.push((*timeline_id, (*layer).clone())); + let local_path = local_layer_path( + self.conf, + self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); } progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions - for (timeline_id, layer_name) in delete_layers { - let timeline_path = self - .conf - .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); - let local_path = timeline_path.join(layer_name.to_string()); + for (timeline_id, layer_name, local_path) in delete_layers { tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); tokio::fs::remove_file(&local_path) @@ -710,13 +725,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, @@ -735,12 +751,13 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { 
debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id); // Accumulate updates to the state let mut touched = Vec::new(); @@ -786,6 +803,39 @@ impl<'a> TenantDownloader<'a> { // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + match tokio::fs::metadata(&local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + local_path, + e + ); + debug_assert!(false); + } + } + } + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) || on_disk.access_time != layer.access_time { @@ -839,6 +889,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { @@ -857,7 +908,13 @@ impl<'a> TenantDownloader<'a> { }; if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); tracing::warn!( "Downloaded layer {} with unexpected size {} != {}. Removing download.", @@ -940,7 +997,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. 
// Layers not present in metadata will be discarded. - let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -964,7 +1021,7 @@ async fn init_timeline_state( continue; } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) - || is_temporary(&file_path) + || is_ephemeral_file(file_name) { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); @@ -977,7 +1034,7 @@ async fn init_timeline_state( continue; } - match LayerFileName::from_str(file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 73cdf6c6d4..ca91ec24c6 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -31,7 +29,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, + pub(super) name: LayerName, pub(super) metadata: IndexLayerMetadata, #[serde_as(as = "TimestampSeconds")] @@ -42,7 +40,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, + name: LayerName, metadata: IndexLayerMetadata, access_time: SystemTime, ) -> Self { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs 
b/pageserver/src/tenant/secondary/heatmap_uploader.rs index a8b05f4c0e..352409f5fc 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,6 +9,7 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, + mgr::GetTenantError, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, @@ -19,12 +20,14 @@ use crate::{ use futures::Future; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ heatmap::HeatMapTenant, - scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, + TenantBackgroundJobs, + }, CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; @@ -180,15 +183,11 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_digest: None, }); // Decline to do the upload if insufficient time has passed @@ -273,7 +272,7 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), @@ -292,8 +291,11 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id, true) + 
.get_attached_tenant_shard(*tenant_shard_id) .map_err(|e| anyhow::anyhow!(e))?; + if !tenant.is_active() { + return Err(GetTenantError::NotActive(*tenant_shard_id).into()); + } Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3bd7be782e..3d042f4513 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,4 +1,5 @@ use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. /// /// Systems that need to run background work across many tenants may use this type diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ad79b74d8b..64fff5536c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -189,7 +189,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). 
This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -216,6 +218,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { @@ -294,8 +298,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5c3bab9868..94a5e9ec47 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,11 +1,11 @@ //! 
Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; -mod inmemory_layer; +pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; @@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -34,15 +34,15 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use super::layer_map::InMemoryLayerHandle; -use super::timeline::layer_manager::LayerManager; +use self::inmemory_layer::InMemoryLayerFileId; + use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,37 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + + /// This function is called after reading a keyspace from a layer. 
+ /// It checks if the read path has now moved past the cached Lsn for any keys. + /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. + pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -162,11 +195,18 @@ impl ValuesReconstructState { true } Value::WalRecord(rec) => { - let reached_cache = - state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + let will_init = rec.will_init(); state.records.push((lsn, rec)); - will_init || reached_cache + will_init } }, }; @@ -204,23 +244,30 @@ impl Default for ValuesReconstructState { } } -/// Description of layer to be read - the layer map can turn -/// this description into the actual layer. 
-#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub(crate) enum ReadableLayerDesc { - Persistent { - desc: PersistentLayerDesc, - lsn_range: Range, - }, - InMemory { - handle: InMemoryLayerHandle, - lsn_ceil: Lsn, - }, +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), } -/// Wraper for 'ReadableLayerDesc' sorted by Lsn +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). #[derive(Debug)] -struct ReadableLayerDescOrdered(ReadableLayerDesc); +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects @@ -231,41 +278,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc); /// a two layer indexing scheme. 
#[derive(Debug)] pub(crate) struct LayerFringe { - layers_by_lsn: BinaryHeap, - layers: HashMap, + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpace, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - layers_by_lsn: BinaryHeap::new(), + planned_reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } - pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { - let handle = match self.layers_by_lsn.pop() { - Some(h) => h, + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&handle.0); + let removed = self.layers.remove_entry(&read_desc.layer_id); match removed { - Some((layer, keyspace)) => Some((layer, keyspace)), + Some(( + _, + LayerKeyspace { + layer, + target_keyspace, + }, + )) => Some((layer, target_keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } - pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { - let entry = self.layers.entry(layer.clone()); + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().merge(&keyspace); + entry.get_mut().target_keyspace.merge(&keyspace); } Entry::Vacant(entry) => { - self.layers_by_lsn - .push(ReadableLayerDescOrdered(entry.key().clone())); - entry.insert(keyspace); + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + entry.insert(LayerKeyspace { + layer, + target_keyspace: keyspace, + }); } } } @@ -277,77 +347,55 @@ impl Default for LayerFringe { } } -impl Ord for ReadableLayerDescOrdered { +impl 
Ord for ReadDesc { fn cmp(&self, other: &Self) -> Ordering { - let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { - self.0 - .get_lsn_floor() - .cmp(&other.0.get_lsn_floor()) - .reverse() + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() } else { ord } } } -impl PartialOrd for ReadableLayerDescOrdered { +impl PartialOrd for ReadDesc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl PartialEq for ReadableLayerDescOrdered { +impl PartialEq for ReadDesc { fn eq(&self, other: &Self) -> bool { - self.0.get_lsn_floor() == other.0.get_lsn_floor() - && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + self.lsn_range == other.lsn_range } } -impl Eq for ReadableLayerDescOrdered {} +impl Eq for ReadDesc {} -impl ReadableLayerDesc { - pub(crate) fn get_lsn_floor(&self) -> Lsn { +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, - ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), - } - } - - pub(crate) fn get_lsn_ceil(&self) -> Lsn { - match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, - ReadableLayerDesc::InMemory { lsn_ceil, .. 
} => *lsn_ceil, + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } pub(crate) async fn get_values_reconstruct_data( &self, - layer_manager: &LayerManager, keyspace: KeySpace, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_range } => { - let layer = layer_manager.get_from_desc(desc); + ReadableLayer::PersistentLayer(layer) => { layer - .get_values_reconstruct_data( - keyspace, - lsn_range.clone(), - reconstruct_state, - ctx, - ) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } - ReadableLayerDesc::InMemory { handle, lsn_ceil } => { - let layer = layer_manager - .layer_map() - .get_in_memory_layer(handle) - .unwrap(); - + ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) .await } } @@ -598,8 +646,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -610,8 +658,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -622,11 +670,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - 
LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b7132ee3bf..c38c9bb656 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. 
The actual page images and WAL records are stored in the @@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -56,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -67,7 +69,8 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, }; /// @@ -216,6 +219,7 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, + lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -307,13 +311,13 @@ impl DeltaLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) @@ -426,9 +430,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. 
/// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -439,9 +449,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -461,18 +472,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. /// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -492,7 +508,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -590,8 +606,18 @@ impl DeltaLayerWriter { /// /// The values must be 
appended in key, lsn order. /// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -600,11 +626,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -619,10 +646,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. if result.is_err() { tracing::warn!( @@ -690,7 +718,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -727,6 +755,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", @@ -741,6 +772,7 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -862,10 +894,10 @@ impl DeltaLayerInner { .into(), ); - let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( - keyspace, + &keyspace, lsn_range, data_end_offset, index_reader, @@ -879,11 +911,13 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state) .await; + reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + Ok(()) } async fn plan_reads( - keyspace: KeySpace, + keyspace: &KeySpace, lsn_range: Range, data_end_offset: u64, index_reader: DiskBtreeReader, @@ -938,7 +972,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } @@ -946,6 +980,34 @@ impl DeltaLayerInner { Ok(planner.finish()) } + fn get_min_read_buffer_size( + planned_reads: &[VectoredRead], + read_size_soft_max: usize, + ) -> usize { + let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else { + return read_size_soft_max; + }; + + let largest_read_size = largest_read.size(); + if largest_read_size > read_size_soft_max { + // If the read is oversized, it should only contain one key. 
+ let offenders = largest_read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } + + largest_read_size + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -959,7 +1021,8 @@ impl DeltaLayerInner { .expect("Layer is loaded with max vectored bytes config") .0 .into(); - let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); + let mut buf = Some(BytesMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -986,7 +1049,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. - buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + buf = Some(BytesMut::with_capacity(buf_size)); continue; } @@ -1073,11 +1136,206 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a version which has the earlier Lsns than `until`. + /// + /// Return the amount of key value records pushed to the writer. 
+ pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + until: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + let mut records = 0; + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < until); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? 
+ // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) + .await; + per_blob_copy = tmp; + + res?; + + records += 1; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(records) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", @@ -1147,6 +1405,43 @@ impl DeltaLayerInner { Ok(()) } + + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -1210,9 +1505,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del mod test { use std::collections::BTreeMap; + use itertools::MinMaxResult; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::RngCore; + use super::*; use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + context::DownloadBehavior, + task_mgr::TaskKind, + tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, + DEFAULT_PG_VERSION, }; /// Construct an index for a fictional delta layer and and then @@ -1273,7 +1575,7 @@ mod test { // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, lsn_range.clone(), disk_offset, reader, @@ -1332,4 +1634,444 @@ mod test { assert_eq!(planned_blobs, expected_blobs); } + + mod constants { + use utils::lsn::Lsn; + + /// Offset used by all lsns in this test + pub(super) const LSN_OFFSET: Lsn = Lsn(0x08); + /// Number of unique keys including in the test data + pub(super) const KEY_COUNT: u8 = 60; + /// Max number of different lsns for each key + pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20; + /// Possible value sizes for each key along with a probability weight + pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)]; + /// Probability that there will be a gap between the current key and the next one (33.3%) + pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)]; + /// The minimum size of a key range in all the generated reads + pub(super) const MIN_RANGE_SIZE: i128 = 10; + /// The number of ranges included in each vectored read + 
pub(super) const RANGES_COUNT: u8 = 2; + /// The number of vectored reads performed + pub(super) const READS_COUNT: u8 = 100; + /// Soft max size of a vectored read. Will be violated if we have to read keys + /// with values larger than the limit + pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024; + } + + struct Entry { + key: Key, + lsn: Lsn, + value: Vec, + } + + fn generate_entries(rng: &mut StdRng) -> Vec { + let mut current_key = Key::MIN; + + let mut entries = Vec::new(); + for _ in 0..constants::KEY_COUNT { + let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY); + let mut lsns_iter = + std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { + Some(Lsn(lsn.0 + 0x08)) + }); + let mut lsns = Vec::new(); + while lsns.len() < count as usize { + let take = rng.gen_bool(0.5); + let lsn = lsns_iter.next().unwrap(); + if take { + lsns.push(lsn); + } + } + + for lsn in lsns { + let size = constants::VALUE_SIZES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + let mut buf = vec![0; size]; + rng.fill_bytes(&mut buf); + + entries.push(Entry { + key: current_key, + lsn, + value: buf, + }) + } + + let gap = constants::KEY_GAP_CHANGES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + if gap { + current_key = current_key.add(2); + } else { + current_key = current_key.add(1); + } + } + + entries + } + + struct EntriesMeta { + key_range: Range, + lsn_range: Range, + index: BTreeMap<(Key, Lsn), Vec>, + } + + fn get_entries_meta(entries: &[Entry]) -> EntriesMeta { + let key_range = match entries.iter().minmax_by_key(|e| e.key) { + MinMaxResult::MinMax(min, max) => min.key..max.key.next(), + _ => panic!("More than one entry is always expected"), + }; + + let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) { + MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1), + _ => panic!("More than one entry is always expected"), + }; + + let mut index = BTreeMap::new(); + for entry in entries.iter() { + 
index.insert((entry.key, entry.lsn), entry.value.clone()); + } + + EntriesMeta { + key_range, + lsn_range, + index, + } + } + + fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range) -> KeySpace { + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + let mut keyspace = KeySpace::default(); + + for _ in 0..constants::RANGES_COUNT { + let mut range: Option> = Option::default(); + while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { + let range_start = rng.gen_range(start..end); + let range_end_offset = range_start + constants::MIN_RANGE_SIZE; + if range_end_offset >= end { + range = Some(Key::from_i128(range_start)..Key::from_i128(end)); + } else { + let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end); + range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); + } + } + keyspace.ranges.push(range.unwrap()); + } + + keyspace + } + + #[tokio::test] + async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let (tenant, ctx) = harness.load().await; + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx) + .await?; + + tracing::info!("Generating test data ..."); + + let rng = &mut StdRng::seed_from_u64(0); + let entries = generate_entries(rng); + let entries_meta = get_entries_meta(&entries); + + tracing::info!("Done generating {} entries", entries.len()); + + tracing::info!("Writing test data to delta layer ..."); + let mut writer = DeltaLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + entries_meta.key_range.start, + entries_meta.lsn_range.clone(), + ) + .await?; + + for entry in entries { + let (_, res) = writer + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) + .await; + res?; + } + + let resident = writer + 
.finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; + + let inner = resident.as_delta(&ctx).await?; + + let file_size = inner.file.metadata().await?.len(); + tracing::info!( + "Done writing test data to delta layer. Resulting file size is: {}", + file_size + ); + + for i in 0..constants::READS_COUNT { + tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); + + let block_reader = FileBlockReader::new(&inner.file, inner.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); + let mut reconstruct_state = ValuesReconstructState::new(); + let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); + let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; + + let vectored_reads = DeltaLayerInner::plan_reads( + &keyspace, + entries_meta.lsn_range.clone(), + data_end_offset, + index_reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&inner.file); + let buf_size = DeltaLayerInner::get_min_read_buffer_size( + &vectored_reads, + constants::MAX_VECTORED_READ_BYTES, + ); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + for read in vectored_reads { + let blobs_buf = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await?; + for meta in blobs_buf.blobs.iter() { + let value = &blobs_buf.buf[meta.start..meta.end]; + assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + } + + buf = Some(blobs_buf.buf); + } + } + + Ok(()) + } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + 
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a 
full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ) + .await + .unwrap(); + + let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + + copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = 
Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 14c79e413c..c9874873e4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -53,6 +54,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -64,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; 
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -230,7 +234,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -266,13 +270,13 @@ impl ImageLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -356,7 +360,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -395,6 +399,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if 
actual_summary != expected_summary { bail!( @@ -540,7 +546,25 @@ impl ImageLayerInner { let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { - let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let buf_size = read.size(); + + if buf_size > max_vectored_read_bytes { + // If the read is oversized, it should only contain one key. + let offenders = read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } + + let buf = BytesMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf).await; match res { @@ -614,7 +638,7 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, @@ -656,9 +680,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -672,7 +701,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. 
/// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -683,7 +716,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -703,7 +736,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -785,8 +818,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -795,8 +833,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5f1db21d49..4dacbec2f3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,19 +12,24 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use 
crate::tenant::{PageReconstructError, Timeline}; -use crate::walrecord; +use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods -use std::fmt::Write as _; +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; +use std::cmp::Ordering; +use std::fmt::Write; use std::ops::Range; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{ @@ -32,10 +37,14 @@ use super::{ ValuesReconstructState, }; +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); + pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -45,6 +54,14 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. + frozen_local_path_str: OnceLock>, + + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. 
inner: RwLock, @@ -61,15 +78,17 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, + + resource_units: GlobalResourceUnits, } impl std::fmt::Debug for InMemoryLayerInner { @@ -78,7 +97,126 @@ impl std::fmt::Debug for InMemoryLayerInner { } } +/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, +/// to minimize contention. +/// +/// This global state is used to implement behaviors that require a global view of the system, e.g. +/// rolling layers proactively to limit the total amount of dirty data. +pub(crate) struct GlobalResources { + // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it. + // Zero means unlimited. + pub(crate) max_dirty_bytes: AtomicU64, + // How many bytes are in all EphemeralFile objects + dirty_bytes: AtomicU64, + // How many layers are contributing to dirty_bytes + dirty_layers: AtomicUsize, +} + +// Per-timeline RAII struct for its contribution to [`GlobalResources`] +struct GlobalResourceUnits { + // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible + // for decrementing the global counter by this many bytes when dropped. + dirty_bytes: u64, +} + +impl GlobalResourceUnits { + // Hint for the layer append path to update us when the layer size differs from the last + // call to update_size by this much. 
If we don't reach this threshold, we'll still get + // updated when the Timeline "ticks" in the background. + const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024; + + fn new() -> Self { + GLOBAL_RESOURCES + .dirty_layers + .fetch_add(1, AtomicOrdering::Relaxed); + Self { dirty_bytes: 0 } + } + + /// Do not call this frequently: all timelines will write to these same global atomics, + /// so this is a relatively expensive operation. Wait at least a few seconds between calls. + /// + /// Returns the effective layer size limit that should be applied, if any, to keep + /// the total number of dirty bytes below the configured maximum. + fn publish_size(&mut self, size: u64) -> Option { + let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed), + Ordering::Greater => { + let delta = size - self.dirty_bytes; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_add(delta, AtomicOrdering::Relaxed); + old + delta + } + Ordering::Less => { + let delta = self.dirty_bytes - size; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_sub(delta, AtomicOrdering::Relaxed); + old - delta + } + }; + + // This is a sloppy update: concurrent updates to the counter will race, and the exact + // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes. + // That's okay: as long as the metric contains some recent value, it doesn't have to always + // be literally the last update. + TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes); + + self.dirty_bytes = size; + + let max_dirty_bytes = GLOBAL_RESOURCES + .max_dirty_bytes + .load(AtomicOrdering::Relaxed); + if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes { + // Set the layer file limit to the average layer size: this implies that all above-average + // sized layers will be elegible for freezing. They will be frozen in the order they + // next enter publish_size. 
+ Some( + new_global_dirty_bytes + / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64, + ) + } else { + None + } + } + + // Call publish_size if the input size differs from last published size by more than + // the drift limit + fn maybe_publish_size(&mut self, size: u64) { + let publish = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => false, + Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT, + Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT, + }; + + if publish { + self.publish_size(size); + } + } +} + +impl Drop for GlobalResourceUnits { + fn drop(&mut self) { + GLOBAL_RESOURCES + .dirty_layers + .fetch_sub(1, AtomicOrdering::Relaxed); + + // Subtract our contribution to the global total dirty bytes + self.publish_size(0); + } +} + +pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { + max_dirty_bytes: AtomicU64::new(0), + dirty_bytes: AtomicU64::new(0), + dirty_layers: AtomicUsize::new(0), +}; + impl InMemoryLayer { + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -93,6 +231,10 @@ impl InMemoryLayer { } } + pub(crate) fn try_len(&self) -> Option { + self.inner.try_read().map(|i| i.file.len()).ok() + } + pub(crate) fn assert_writable(&self) { assert!(self.end_lsn.get().is_none()); } @@ -105,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -236,29 +384,24 @@ impl InMemoryLayer { let mut planned_block_reads = BinaryHeap::new(); for range in keyspace.ranges.iter() { - let mut key = range.start; - while key < range.end { - if let Some(vec_map) = inner.index.get(&key) { - let lsn_range = match reconstruct_state.get_cached_lsn(&key) 
{ - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key, - lsn: *entry_lsn, - block_offset: *pos, - }); - } + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + lsn: *entry_lsn, + block_offset: *pos, + }); } - - key = key.next(); } } - let keyspace_size = keyspace.total_size(); + let keyspace_size = keyspace.total_raw_size(); let mut completed_keys = HashSet::new(); while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { @@ -290,14 +433,30 @@ impl InMemoryLayer { } } + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + Ok(()) } } +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -318,16 +477,26 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let key = 
InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { + file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, + resource_units: GlobalResourceUnits::new(), }), }) } @@ -378,9 +547,22 @@ impl InMemoryLayer { warn!("Key {} at {} already exists", key, lsn); } + let size = locked_inner.file.len(); + locked_inner.resource_units.maybe_publish_size(size); + Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + + pub(crate) async fn tick(&self) -> Option { + let mut inner = self.inner.write().await; + let size = inner.file.len(); + inner.resource_units.publish_size(size) + } + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys Ok(()) @@ -399,6 +581,15 @@ impl InMemoryLayer { ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); @@ -406,14 +597,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. 
/// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -427,6 +621,21 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -440,33 +649,24 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. - // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. 
- let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ba37b5a86..b5b0260327 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,26 +4,28 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use 
crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, + AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; @@ -116,14 +118,48 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0) + } +} + +pub(crate) fn local_layer_path( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer_file_name: &LayerName, + _generation: &Generation, +) -> Utf8PathBuf { + let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); + + timeline_path.join(layer_file_name.to_string()) + + // TODO: switch to enabling new-style layer paths after next release + // if generation.is_none() { + // // Without a generation, we may only use legacy path style + // timeline_path.join(layer_file_name.to_string()) + // } else { + // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + // } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. 
pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &file_name, + &metadata.generation, + ); + let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, @@ -136,6 +172,7 @@ impl Layer { let owner = Layer(Arc::new(LayerInner::new( conf, timeline, + local_path, access_stats, desc, None, @@ -152,7 +189,8 @@ impl Layer { pub(crate) fn for_resident( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + local_path: Utf8PathBuf, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( @@ -177,6 +215,7 @@ impl Layer { LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -218,9 +257,19 @@ impl Layer { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); + + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &desc.layer_name(), + &timeline.generation, + ); + LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -330,6 +379,12 @@ impl Layer { .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) } /// Download the layer if evicted. 
@@ -389,10 +444,21 @@ impl Layer { &self.0.path } + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } + pub(crate) fn get_timeline_id(&self) -> Option { + self.0 + .timeline + .upgrade() + .map(|timeline| timeline.timeline_id) + } + /// Traditional debug dumping facility #[allow(unused)] pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> { @@ -511,6 +577,9 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the layer, used for traversal id. + debug_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -604,16 +673,24 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? seems more like a hazard - // than a clear win. 
return; } let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); @@ -681,19 +758,17 @@ impl Drop for LayerInner { } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); @@ -708,7 +783,10 @@ impl LayerInner { LayerInner { conf, - path, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, + path: local_path, desc, timeline: Arc::downgrade(timeline), have_remote_client: timeline.remote_client.is_some(), @@ -911,11 +989,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; 
scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -954,6 +1041,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -983,7 +1071,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1026,6 +1114,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1033,7 +1122,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .download_layer_file( + &self.desc.layer_name(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { @@ -1166,7 +1260,7 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); let resident = self .inner @@ -1180,7 +1274,7 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, @@ -1191,7 +1285,7 @@ impl LayerInner { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, @@ -1552,8 +1646,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, we are shutting down - 
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. } } } @@ -1752,6 +1846,26 @@ impl ResidentLayer { } } + /// Returns the amount of keys and values written to the writer. + pub(crate) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + until: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? { + Delta(ref d) => d + .copy_prefix(writer, until, ctx) + .await + .with_context(|| format!("copy_delta_prefix until {until} of {self}")), + Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")), + } + } + pub(crate) fn local_path(&self) -> &Utf8Path { &self.owner.0.path } @@ -1759,6 +1873,18 @@ impl ResidentLayer { pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } + + #[cfg(test)] + pub(crate) async fn as_delta( + &self, + ctx: &RequestContext, + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), + } + } } impl AsLayerDesc for ResidentLayer { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 247ff123b5..52f62faa8d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -721,11 +721,110 @@ async fn evict_and_wait_does_not_wait_for_download() { layer.evict_and_wait(FOREVER).await.unwrap(); } +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. 
+/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; + + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been 
a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + +/// A test case to remind you the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. #[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2328); + assert_eq!(std::mem::size_of::(), 2344); // it also has the utf8 path } diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index c375923e81..a89b66e4a1 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -51,7 +51,7 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl 
PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get a image layer name for this layer. /// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 57% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index a98be0842b..c733404693 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -2,40 +2,42 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! 
use crate::repository::Key; +use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -55,16 +57,14 @@ impl Ord for DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. 
pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -74,10 +74,19 @@ impl DeltaFileName { let key_end_str = key_parts.next()?; let lsn_start_str = lsn_parts.next()?; let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } + if key_start_str.len() != 36 + || key_end_str.len() != 36 + || lsn_start_str.len() != 16 + || lsn_end_str.len() != 16 + { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -94,14 +103,14 @@ impl DeltaFileName { // or panic? } - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -115,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -153,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste 
this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -161,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as then LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -182,19 +189,23 @@ impl ImageFileName { return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -205,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression). 
The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -228,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -236,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -245,23 +259,36 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; + /// Conversion from 
either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self. fn from_str(value: &str) -> Result { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); + let file_name: Cow = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -276,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -288,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -315,3 +342,42 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() -> anyhow::Result<()> { + let expected = LayerName::Image(ImageLayerName { + key_range: Key::from_i128(0) + 
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + Ok(()) + } + + #[test] + fn delta_layer_parse() -> anyhow::Result<()> { + let expected = LayerName::Delta(DeltaLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..f153719f98 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -2,6 +2,7 @@ //! 
such as compaction and GC use std::ops::ControlFlow; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -9,16 +10,18 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -44,6 +47,7 @@ pub(crate) enum BackgroundLoopKind { Compaction, Gc, Eviction, + IngestHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -62,7 +66,7 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { +) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); @@ -72,6 +76,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), @@ -131,6 +136,30 @@ pub fn start_background_loops( } }, ); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + 
TaskKind::IngestHousekeeping, + Some(tenant_shard_id), + None, + &format!("ingest housekeeping for tenant {tenant_shard_id}"), + false, + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + ingest_housekeeping_loop(tenant, cancel) + .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + .await; + Ok(()) + } + }, + ); } /// @@ -378,6 +407,61 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + tokio::select! { + _ = cancel.cancelled() => { + return; + }, + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(()) => (), + }, + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. 
+ let period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + let period = if period == Duration::ZERO { + humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) + .unwrap() + .into() + } else { + period + }; + + // Jitter the period by +/- 5% + let period = + rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. + if tokio::time::timeout(period, cancel.cancelled()) + .await + .is_ok() + { + break; + } + + let started_at = Instant::now(); + tenant.ingest_housekeeping().await; + + warn_when_period_overrun( + started_at.elapsed(), + period, + BackgroundLoopKind::IngestHouseKeeping, + ); + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); +} + async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { @@ -419,8 +503,6 @@ pub(crate) async fn random_init_delay( period: Duration, cancel: &CancellationToken, ) -> Result<(), Cancelled> { - use rand::Rng; - if period == Duration::ZERO { return Ok(()); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7523130f23..505dc8c30d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,6 @@ mod compaction; pub mod delete; +pub(crate) mod detach_ancestor; mod eviction_task; mod init; pub mod layer_manager; @@ -9,20 +10,25 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; +use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, - keyspace::KeySpaceAccum, + key::{ + 
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, + keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - EvictionPolicy, LayerMapInfo, TimelineState, + AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + TimelineState, }, reltag::BlockNumber, - shard::{ShardIdentity, TenantShardId}, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -54,7 +60,7 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -64,18 +70,21 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; -use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; +use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }, }; use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -118,11 +127,11 @@ use 
self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::remote_timeline_client::RemoteTimelineClient; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -136,6 +145,25 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. + Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Hole { @@ -181,9 +209,19 @@ pub(crate) struct AuxFilesState { pub(crate) n_deltas: usize, } +/// The relation size cache caches relation sizes at the end of the timeline. 
It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. +pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, +} + pub struct Timeline { conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, myself: Weak, @@ -281,10 +319,12 @@ pub struct Timeline { pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. - /// The value is a counter, incremented every time a new flush cycle is requested. - /// The flush cycle counter is sent back on the layer_flush_done channel when - /// the flush finishes. You can use that to wait for the flush to finish. - layer_flush_start_tx: tokio::sync::watch::Sender, + /// - The u64 value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn + /// read by whoever sends an update + layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, @@ -293,7 +333,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. 
- pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -304,11 +344,13 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, + last_image_layer_creation_check_at: AtomicLsn, + /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -319,7 +361,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -375,33 +417,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. + pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? 
- pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. @@ -423,6 +491,51 @@ pub(crate) enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +#[derive(Debug)] +pub struct MissingKeyError { + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, "\n{}", backtrace)?; + } + + Ok(()) + } } impl PageReconstructError { @@ -434,6 +547,7 @@ impl PageReconstructError { AncestorLsnTimeout(_) => false, Cancelled | AncestorStopping(_) => true, WalRedo(_) => false, + MissingKey { .. 
} => false, } } } @@ -477,8 +591,8 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key {0} not found")] - MissingKey(Key), + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), #[error(transparent)] GetReadyAncestorError(GetReadyAncestorError), @@ -581,6 +695,19 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; @@ -610,6 +737,42 @@ pub enum GetVectoredImpl { Vectored, } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. 
+ FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. + Hard, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -652,16 +815,6 @@ impl Timeline { key: Key, lsn: Lsn, ctx: &RequestContext, - ) -> Result { - self.timeline_get_throttle.throttle(ctx, 1).await; - self.get_impl(key, lsn, ctx).await - } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); @@ -672,13 +825,7 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -701,12 +848,88 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); + + // Only add the cached image to the reconstruct state when it exists. 
+ if cached_page_img.is_some() { + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + } + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. 
+ trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -716,7 +939,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && res.is_err() { @@ -729,7 +952,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -758,7 +981,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = keyspace.total_size().try_into().unwrap(); + let key_count = keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -795,7 +1018,9 @@ impl Timeline { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } GetVectoredImpl::Vectored => { - let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .await; if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) @@ -830,6 +1055,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. 
The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. + #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 > METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. 
pub(super) async fn get_vectored_sequential_impl( &self, @@ -838,18 +1127,47 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get_impl(key, lsn, ctx).await; + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; use PageReconstructError::*; match block { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - Err(Other(err)) if err.to_string().contains("could not find data for key") => { - return Err(GetVectoredError::MissingKey(key)) + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { + return Err(GetVectoredError::MissingKey(err)); + } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. 
+ return Err(GetVectoredError::Other(err)); } _ => { values.insert(key, block); @@ -866,14 +1184,27 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_raw_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -887,6 +1218,19 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); + + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. 
segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } Ok(results) } @@ -899,6 +1243,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -908,7 +1257,7 @@ impl Timeline { match (lhs, rhs) { (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, (Other(_), Other(_)) => true, _ => false, @@ -922,6 +1271,11 @@ impl Timeline { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). + }, (Ok(_), Err(vec_err)) => { panic!(concat!("Vectored get failed with {}, but sequential get did not", " - keyspace={:?} lsn={}"), @@ -1002,6 +1356,12 @@ impl Timeline { self.last_record_lsn.load() } + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
+ pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } @@ -1058,7 +1418,8 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { if self.cancel.is_cancelled() { return Err(WaitLsnError::Shutdown); @@ -1066,20 +1427,28 @@ impl Timeline { return Err(WaitLsnError::BadState); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's 
is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -1138,8 +1507,117 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + self.freeze_and_flush0().await + } + + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without + // polluting the span hierarchy. + pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { + let to_lsn = self.freeze_inmem_layer(false).await; + self.flush_frozen_layers_and_wait(to_lsn).await + } + + // Check if an open ephemeral layer should be closed: this provides + // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping + // an ephemeral layer open forever when idle. It also freezes layers if the global limit on + // ephemeral layer bytes has been breached. + pub(super) async fn maybe_freeze_ephemeral_layer(&self) { + let Ok(_write_guard) = self.write_lock.try_lock() else { + // If the write lock is held, there is an active wal receiver: rolling open layers + // is their responsibility while they hold this lock. + return; + }; + + let Ok(layers_guard) = self.layers.try_read() else { + // Don't block if the layer lock is busy + return; + }; + + let Some(open_layer) = &layers_guard.layer_map().open_layer else { + // If there is no open layer, we have no layer freezing to do. However, we might need to generate + // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions + // that didn't result in writes to this shard. 
+ + // Must not hold the layers lock while waiting for a flush. + drop(layers_guard); + + let last_record_lsn = self.get_last_record_lsn(); + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + if last_record_lsn > disk_consistent_lsn { + // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates + // we are a sharded tenant and have skipped some WAL + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard + // without any data ingested (yet) doesn't write a remote index as soon as it + // sees its LSN advance: we only do this if we've been layer-less + // for some time. + tracing::debug!( + "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", + disk_consistent_lsn, + last_record_lsn + ); + + // The flush loop will update remote consistent LSN as well as disk consistent LSN. + self.flush_frozen_layers_and_wait(last_record_lsn) + .await + .ok(); + } + } + + return; + }; + + let Some(current_size) = open_layer.try_len() else { + // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so + // read lock to get size should always succeed. + tracing::warn!("Lock conflict while reading size of open layer"); + return; + }; + + let current_lsn = self.get_last_record_lsn(); + + let checkpoint_distance_override = open_layer.tick().await; + + if let Some(size_override) = checkpoint_distance_override { + if current_size > size_override { + // This is not harmful, but it only happens in relatively rare cases where + // time-based checkpoints are not happening fast enough to keep the amount of + // ephemeral data within configured limits. It's a sign of stress on the system. 
+ tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + } + } + + let checkpoint_distance = + checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance()); + + if self.should_roll( + current_size, + current_size, + checkpoint_distance, + self.get_last_record_lsn(), + self.last_freeze_at.load(), + open_layer.get_opened_at(), + ) { + match open_layer.info() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { + // We may reach this point if the layer was already frozen by not yet flushed: flushing + // happens asynchronously in the background. + tracing::debug!( + "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" + ); + } + InMemoryLayerInfo::Open { .. } => { + // Upgrade to a write lock and freeze the layer + drop(layers_guard); + let mut layers_guard = self.layers.write().await; + layers_guard + .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at) + .await; + } + } + self.flush_frozen_layers(); + } } /// Outermost timeline compaction operation; downloads needed layers. @@ -1196,105 +1674,133 @@ impl Timeline { pub(crate) fn activate( self: &Arc, + parent: Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(background_jobs_can_start); + self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks are left running. 
/// - /// While we are flushing, we continue to accept read I/O. - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc>>` or similar, + /// instead of spawning directly on a runtime. It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. + /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`] + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?) 
+ // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive: + /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) + pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush - tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but, we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { + walreceiver.cancel(); + } + // ... and inform any waiters for newer LSNs that there won't be any. 
self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - if let Err(e) = client.shutdown().await { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to flush to remote storage: {e:#}"); + if try_freeze_and_flush { + // we shut down walreceiver above, so, we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.shutdown().await; } } - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); + Err(e) => { + // Non-fatal. Shutdown is infallible. 
Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. - pub(crate) async fn shutdown(&self) { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) if let Some(remote_client) = self.remote_client.as_ref() { - match remote_client.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - // Shutting down during initialization is legal - } - } + remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; } + // TODO: work toward making this a no-op. See this funciton's doc comment for more context. 
tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left. self.gate.close().await; self.metrics.shutdown(); @@ -1405,7 +1911,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &str, + layer_file_name: &LayerName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1423,7 +1929,10 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. - pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result> { let _gate = self .gate .enter() @@ -1443,6 +1952,53 @@ impl Timeline { Err(EvictionError::Timeout) => Ok(Some(false)), } } + + fn should_roll( + &self, + layer_size: u64, + projected_layer_size: u64, + checkpoint_distance: u64, + projected_lsn: Lsn, + last_freeze_at: Lsn, + opened_at: Instant, + ) -> bool { + let distance = projected_lsn.widening_sub(last_freeze_at); + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. 
The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + projected_lsn, layer_size, distance + ); + + true + } else if projected_layer_size >= checkpoint_distance { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + projected_lsn, layer_size, projected_layer_size + ); + + true + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { + info!( + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); + + true + } else { + false + } + } } /// Number of times we will compute partition within a checkpoint distance. @@ -1450,58 +2006,74 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - pub(crate) fn get_lazy_slru_download(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf + .switch_aux_file_policy + .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) + } + + pub(crate) fn get_lazy_slru_download(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = 
self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_compaction_algorithm(&self) -> CompactionAlgorithm { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_algorithm .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1515,14 +2087,26 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - pub(super) fn tenant_conf_updated(&self) { + fn get_image_layer_creation_check_threshold(&self) -> u8 { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + 
.default_tenant_conf + .image_layer_creation_check_threshold, + ) + } + + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1549,7 +2133,7 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, @@ -1565,17 +2149,16 @@ impl Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; Arc::new_cyclic(|myself| { let mut result = Timeline { @@ -1632,11 +2215,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1650,11 +2229,18 @@ impl Timeline { // initial 
logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, + last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), @@ -1680,6 +2266,7 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; + result .metrics .last_record_gauge @@ -1756,20 +2343,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( @@ -1807,13 +2393,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create 
ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -1837,8 +2423,8 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_path, file_size) => { + discovered_layers.push((layer_file_name, local_path, file_size)); continue; } Discovered::Metadata => { @@ -1888,31 +2474,30 @@ impl Timeline { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.file_size() == remote.file_size() { + if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. - UseLocal(remote) + UseLocal( + LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path + } + ) } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); + init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + if let Some(local) = local { + init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + init::cleanup_local_only_file(&name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -1926,9 +2511,9 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - 
Layer::for_resident(conf, &this, name, m).drop_eviction_guard() + UseLocal(local) => { + total_physical_size += local.metadata.file_size(); + Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -1999,7 +2584,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2295,7 +2880,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. 
- debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate @@ -2317,10 +2902,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -2413,11 +2994,11 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { + let historic_layer_name = historic_layer.layer_name(); + if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } } @@ -2446,8 +3027,8 @@ impl Timeline { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( - layer.layer_desc().filename(), - layer.metadata().into(), + layer.layer_desc().layer_name(), + (&layer.metadata()).into(), last_activity_ts, ) }); @@ -2456,9 +3037,21 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + + /// Returns true if the given lsn is or was an ancestor branchpoint. 
+ pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self + .remote_client + .as_ref() + .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2466,13 +3059,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.debug_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2501,7 +3094,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2541,32 +3134,33 @@ impl Timeline { if prev <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
- return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + return Err(PageReconstructError::MissingKey(MissingKeyError { key, - Lsn(cont_lsn.0 - 1), + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); } } prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + })); } } @@ -2592,10 +3186,14 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); + + let open_layer = open_layer.clone(); + drop(guard); + result = match open_layer .get_value_reconstruct_data( key, @@ -2609,23 +3207,20 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let open_layer = Arc::clone(open_layer); - move || open_layer.traversal_id() - }), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); + + let frozen_layer = frozen_layer.clone(); + drop(guard); + result = match frozen_layer .get_value_reconstruct_data( key, @@ -2639,21 +3234,15 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let frozen_layer = Arc::clone(frozen_layer); - move || frozen_layer.traversal_id() - }), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); + drop(guard); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -2666,14 +3255,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -2726,11 +3308,22 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); - if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? + if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
+ cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ctx) .await @@ -2738,14 +3331,24 @@ impl Timeline { timeline = &*timeline_owned; } - if keyspace.total_size() != 0 { - return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + if keyspace.total_raw_size() != 0 { + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); } Ok(()) } - /// Collect the reconstruct data for a ketspace from the specified timeline. + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect /// the current keyspace. The current keyspace of the search at any given timeline @@ -2771,16 +3374,6 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); - // Hold the layer map whilst visiting the timeline to prevent - // compaction, eviction and flushes from rendering the layers unreadable. - // - // TODO: Do we actually need to do this? In theory holding on - // to [`tenant::storage_layer::Layer`] should be enough. However, - // [`Timeline::get`] also holds the lock during IO, so more investigation - // is needed. 
- let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -2790,54 +3383,77 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. + if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); - match in_memory_layer { - Some(l) => { - fringe.update( - ReadableLayerDesc::InMemory { - handle: l, - lsn_ceil: cont_lsn, - }, - unmapped_keyspace.clone(), - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayerDesc::Persistent { - desc: (*layer).clone(), - lsn_range: lsn_floor..cont_lsn, - }, - keyspace_accum.to_keyspace(), - ) - }) - .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); + + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + 
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } } } + + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); } - if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( - &guard, keyspace_to_read.clone(), + lsn_range, reconstruct_state, ctx, ) .await?; unmapped_keyspace = keyspace_to_read; - cont_lsn = layer_to_read.get_lsn_floor(); + cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } @@ -2915,7 +3531,7 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, ctx) + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), @@ -2926,7 +3542,7 @@ impl Timeline { Ok(ancestor) } - fn get_ancestor_timeline(&self) -> anyhow::Result> { + pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( "Ancestor is missing. 
Timeline id: {} Ancestor id {:?}", @@ -2965,7 +3581,9 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { + /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn + /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). + async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. @@ -2975,7 +3593,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; - self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn).await; + to_lsn } async fn freeze_inmem_layer_at(&self, at: Lsn) { @@ -2988,25 +3608,24 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { info!("started flush loop"); loop { tokio::select! 
{ _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); + info!("shutting down layer flush task due to Timeline::cancel"); break; }, _ = layer_flush_start_rx.changed() => {} } - trace!("waking up"); - let flush_counter = *layer_flush_start_rx.borrow(); + let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow(); + + // The highest LSN to which we flushed in the loop over frozen layers + let mut flushed_to_lsn = Lsn(0); + let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); @@ -3027,7 +3646,9 @@ impl Timeline { break Ok(()); }; match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} + Ok(this_layer_to_lsn) => { + flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + } Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3036,11 +3657,36 @@ impl Timeline { FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); - break err; + break err.map(|_| ()); } } timer.stop_and_record(); }; + + // Unsharded tenants should never advance their LSN beyond the end of the + // highest layer they write: such gaps between layer data and the frozen LSN + // are only legal on sharded tenants. + debug_assert!( + self.shard_identity.count.count() > 1 + || flushed_to_lsn >= frozen_to_lsn + || !flushed_to_lsn.is_valid() + ); + + if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 { + // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised + // to us via layer_flush_start_rx, then advance it here. + // + // This path is only taken for tenants with multiple shards: single sharded tenants should + // never encounter a gap in the wal. 
+ let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + if self.set_disk_consistent_lsn(frozen_to_lsn) { + if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { + tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + } + } + } + // Notify any listeners that we're done let _ = self .layer_flush_done_tx @@ -3048,7 +3694,13 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// + /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, + /// it means no data will be written between the top of the highest frozen layer and to_lsn, + /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. + async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. 
@@ -3062,9 +3714,10 @@ impl Timeline { anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(last_record_lsn, *lsn); }); loop { @@ -3101,16 +3754,22 @@ impl Timeline { } fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + *counter += 1; + + *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); + }); } /// Flush one frozen in-memory layer to disk, as a new delta layer. + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, @@ -3118,66 +3777,103 @@ impl Timeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. 
- } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. - // We will remove frozen layer and add delta layer in one atomic operation later. - let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. 
We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single aux file keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + assert_eq!( + metadata_keyspace.0.ranges.len(), + 1, + "aux file keyspace should be a single range" + ); + self.create_delta_layer( + &frozen_layer, + ctx, + Some(metadata_keyspace.0.ranges[0].clone()), + ) + .await? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. + let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? 
else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -3185,7 +3881,6 @@ impl Timeline { } let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in @@ -3199,10 +3894,7 @@ impl Timeline { guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - self.disk_consistent_lsn.store(disk_consistent_lsn); - + if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } @@ -3219,7 +3911,22 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - Ok(()) + Ok(Lsn(lsn_range.end.0 - 1)) + } + + /// Return true if the value changed + /// + /// This function must only be used from the layer flush task, and may not be called concurrently. + fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { + // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. 
+ let old_value = self.disk_consistent_lsn.load(); + if new_value != old_value { + assert!(new_value >= old_value); + self.disk_consistent_lsn.store(new_value); + true + } else { + false + } } /// Update metadata file @@ -3227,7 +3934,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -3244,19 +3951,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -3268,10 +3966,10 @@ impl Timeline { for layer in layers_to_upload { remote_client.schedule_layer_file_upload(layer)?; } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + remote_client.schedule_index_upload_for_metadata_update(&update)?; } - Ok(metadata) + Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { @@ -3295,12 +3993,18 @@ impl Timeline { self: &Arc, frozen_layer: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + key_range: Option>, + ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? 
+ else { + return Ok(None); + }; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -3319,7 +4023,7 @@ impl Timeline { .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) + anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. @@ -3346,19 +4050,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. 
anyhow::bail!("repartition() called concurrently, this should not happen"); }; - if lsn < partitioning_guard.1 { + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { @@ -3367,13 +4072,18 @@ impl Timeline { threshold = self.repartition_threshold, "no repartitioning needed" ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); - - *partitioning_guard = (partitioning, lsn); + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -3429,12 +4139,12 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -3451,11 +4161,46 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
let mut start = Key::MIN; + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - if !force && !self.time_for_new_image_layer(partition, lsn).await { - start = img_range.end; - continue; + + if partition.overlaps(&Key::metadata_key_range()) { + // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a + // rather big change. Keep this patch small for now. + match mode { + ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { + // skip image layer creation anyways for metadata keys. 
+ start = img_range.end; + continue; + } + ImageLayerCreationMode::Initial => { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } } let mut image_layer_writer = ImageLayerWriter::new( @@ -3496,8 +4241,8 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS - || last_key_in_range + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) @@ -3535,7 +4280,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -3546,7 +4291,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the @@ -3613,6 +4358,48 @@ impl Timeline { _ = self.cancel.cancelled() => {} ) } + + /// Detach this timeline from its ancestor by copying all of ancestors layers as this + /// Timelines layers up to the ancestor_lsn. 
+ /// + /// Requires a timeline that: + /// - has an ancestor to detach from + /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not + /// a technical requirement + /// + /// After the operation has been started, it cannot be canceled. Upon restart it needs to be + /// polled again until completion. + /// + /// During the operation all timelines sharing the data with this timeline will be reparented + /// from our ancestor to be branches of this timeline. + pub(crate) async fn prepare_to_detach_from_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + options: detach_ancestor::Options, + ctx: &RequestContext, + ) -> Result< + ( + completion::Completion, + detach_ancestor::PreparedTimelineDetach, + ), + detach_ancestor::Error, + > { + detach_ancestor::prepare(self, tenant, options, ctx).await + } + + /// Completes the ancestor detach. This method is to be called while holding the + /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any + /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// + /// Pageserver receiving a SIGKILL during this operation is not supported (yet). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + prepared: detach_ancestor::PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + detach_ancestor::complete(self, tenant, prepared, ctx).await + } } /// Top-level failure to compact. 
@@ -3721,7 +4508,43 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + async fn rewrite_layers( + self: &Arc, + replace_layers: Vec<(Layer, ResidentLayer)>, + drop_layers: Vec, + ) -> anyhow::Result<()> { + let mut guard = self.layers.write().await; + + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + + let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); + + if let Some(remote_client) = self.remote_client.as_ref() { + remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; + } + + Ok(()) + } + + /// Schedules the uploads of the given image layers + fn upload_new_image_layers( + self: &Arc, + new_images: impl IntoIterator, + ) -> anyhow::Result<()> { + let Some(remote_client) = &self.remote_client else { + return Ok(()); + }; + for layer in new_images { + remote_client.schedule_layer_file_upload(layer)?; + } + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + Ok(()) + } + + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. @@ -3729,17 +4552,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. 
/// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -3747,23 +4559,22 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. - /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { + let _timer = self + .metrics + .find_gc_cutoffs_histo + .start_timer() + .record_on_drop(); + + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. 
// // Some unit tests depend on garbage-collection working even when @@ -3809,19 +4620,14 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. Interpret this as "keep no history". + self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -3850,8 +4656,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) }; @@ -3878,6 +4684,8 @@ impl Timeline { retain_lsns: Vec, new_gc_cutoff: Lsn, ) -> anyhow::Result { + // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc + let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); @@ -3931,7 +4739,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -3942,7 +4750,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -3961,7 +4769,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's 
still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -3992,7 +4800,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4000,7 +4808,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); @@ -4331,35 +5139,7 @@ impl Timeline { } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. -fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) -} +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); struct TimelineWriterState { open_layer: Arc, @@ -4370,23 +5150,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. 
cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4460,49 +5233,6 @@ impl<'a> TimelineWriter<'a> { res } - /// "Tick" the timeline writer: it will roll the open layer if required - /// and do nothing else. - pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { - self.open_layer_if_present().await?; - - let last_record_lsn = self.get_last_record_lsn(); - let action = self.get_open_layer_action(last_record_lsn, 0); - if action == OpenLayerAction::Roll { - self.roll_layer(last_record_lsn).await?; - } - - Ok(()) - } - - /// Populate the timeline writer state only if an in-memory layer - /// is already open. 
- async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { - assert!(self.write_guard.is_none()); - - let open_layer = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - match layers.open_layer { - Some(ref open_layer) => open_layer.clone(), - None => { - return Ok(()); - } - } - }; - - let initial_size = open_layer.size().await?; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); - self.write_guard.replace(TimelineWriterState::new( - open_layer, - initial_size, - last_freeze_at, - last_freeze_ts, - )); - - Ok(()) - } - async fn handle_open_layer_action( &mut self, at: Lsn, @@ -4528,12 +5258,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4574,43 +5302,14 @@ impl<'a> TimelineWriter<'a> { return OpenLayerAction::None; } - let distance = lsn.widening_sub(state.cached_last_freeze_at); - let proposed_open_layer_size = state.current_size + new_value_size; - - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. 
- if distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 - { - info!( - "Will roll layer at {} with layer size {} due to LSN distance ({})", - lsn, state.current_size, distance - ); - - OpenLayerAction::Roll - } else if proposed_open_layer_size >= self.get_checkpoint_distance() { - info!( - "Will roll layer at {} with layer size {} due to layer size ({})", - lsn, state.current_size, proposed_open_layer_size - ); - - OpenLayerAction::Roll - } else if distance > 0 - && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() - { - info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - lsn, - state.current_size, - state.cached_last_freeze_ts.elapsed() - ); - + if self.tl.should_roll( + state.current_size, + state.current_size + new_value_size, + self.get_checkpoint_distance(), + lsn, + state.cached_last_freeze_at, + state.open_layer.get_opened_at(), + ) { OpenLayerAction::Roll } else { OpenLayerAction::None diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 74b75dabf0..e83878b8fb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,14 +9,14 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::TenantShardId; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -94,7 +94,7 @@ impl Timeline { // 
Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - match self + let partition_count = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -103,7 +103,7 @@ impl Timeline { ) .await { - Ok((partitioning, lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::extend(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -116,27 +116,38 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let layers = self + let dense_layers = self .create_image_layers( - &partitioning, + &dense_partitioning, lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, &image_ctx, ) .await .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } + // For now, nothing will be produced... 
+ let sparse_layers = self + .create_image_layers( + &sparse_partitioning.clone().into_dense(), + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + assert!(sparse_layers.is_empty()); + + self.upload_new_image_layers(dense_layers)?; + dense_partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -148,9 +159,150 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } + 1 } }; + if self.shard_identity.count >= ShardCount::new(2) { + // Limit the number of layer rewrites to the number of partitions: this means its + // runtime should be comparable to a full round of image layer creations, rather than + // being potentially much longer. + let rewrite_max = partition_count; + + self.compact_shard_ancestors(rewrite_max, ctx).await?; + } + + Ok(()) + } + + /// Check for layers that are elegible to be rewritten: + /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that + /// we don't indefinitely retain keys in this shard that aren't needed. + /// - For future use: layers beyond pitr_interval that are in formats we would + /// rather not maintain compatibility with indefinitely. + /// + /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound + /// how much work it will try to do in each compaction pass. + async fn compact_shard_ancestors( + self: &Arc, + rewrite_max: usize, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut drop_layers = Vec::new(); + let layers_to_rewrite: Vec = Vec::new(); + + // We will use the PITR cutoff as a condition for rewriting layers. 
+ let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + + let layers = self.layers.read().await; + for layer_desc in layers.layer_map().iter_historic_layers() { + let layer = layers.get_from_desc(&layer_desc); + if layer.metadata().shard.shard_count == self.shard_identity.count { + // This layer does not belong to a historic ancestor, no need to re-image it. + continue; + } + + // This layer was created on an ancestor shard: check if it contains any data for this shard. + let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity); + let layer_local_page_count = sharded_range.page_count(); + let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range()); + if layer_local_page_count == 0 { + // This ancestral layer only covers keys that belong to other shards. + // We include the full metadata in the log: if we had some critical bug that caused + // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard.", + ); + + if cfg!(debug_assertions) { + // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being + // wrong. 
If ShardedRange claims the local page count is zero, then no keys in this layer + // should be !is_key_disposable() + let range = layer_desc.get_key_range(); + let mut key = range.start; + while key < range.end { + debug_assert!(self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + drop_layers.push(layer); + continue; + } else if layer_local_page_count != u32::MAX + && layer_local_page_count == layer_raw_page_count + { + debug!(%layer, + "layer is entirely shard local ({} keys), no need to filter it", + layer_local_page_count + ); + continue; + } + + // Don't bother re-writing a layer unless it will at least halve its size + if layer_local_page_count != u32::MAX + && layer_local_page_count > layer_raw_page_count / 2 + { + debug!(%layer, + "layer is already mostly local ({}/{}), not rewriting", + layer_local_page_count, + layer_raw_page_count + ); + } + + // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually + // without incurring the I/O cost of a rewrite. + if layer_desc.get_lsn_range().end >= pitr_cutoff { + debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", + layer_desc.get_lsn_range().end, pitr_cutoff); + continue; + } + + if layer_desc.is_delta() { + // We do not yet implement rewrite of delta layers + debug!(%layer, "Skipping rewrite of delta layer"); + continue; + } + + // Only rewrite layers if they would have different remote paths: either they belong to this + // shard but an old generation, or they belonged to another shard. 
This also implicitly + // guarantees that the layer is persistent in remote storage (as only remote persistent + // layers are carried across shard splits, any local-only layer would be in the current generation) + if layer.metadata().generation == self.generation + && layer.metadata().shard.shard_count == self.shard_identity.count + { + debug!(%layer, "Skipping rewrite, is not from old generation"); + continue; + } + + if layers_to_rewrite.len() >= rewrite_max { + tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + layers_to_rewrite.len() + ); + continue; + } + + // Fall through: all our conditions for doing a rewrite passed. + // TODO: implement rewriting + tracing::debug!(%layer, "Would rewrite layer"); + } + + // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + drop(layers); + + // TODO: collect layers to rewrite + let replace_layers = Vec::new(); + + // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage + self.rewrite_layers(replace_layers, drop_layers).await?; + + if let Some(remote_client) = self.remote_client.as_ref() { + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. 
+ remote_client.wait_completion().await?; + } + Ok(()) } @@ -511,7 +663,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -553,7 +705,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -569,7 +725,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -769,8 +925,9 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let keyspace = self.collect_keyspace(end_lsn, ctx).await?; - let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. 
+ let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, @@ -818,7 +975,10 @@ impl TimelineAdaptor { self.timeline .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) .await?; - self.new_images.clear(); + + self.timeline + .upload_new_image_layers(std::mem::take(&mut self.new_images))?; + self.new_deltas.clear(); self.layers_to_delete.clear(); Ok(()) @@ -839,6 +999,10 @@ impl CompactionJobExecutor for TimelineAdaptor { type RequestContext = crate::context::RequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + async fn get_layers( &mut self, key_range: &Range, @@ -955,7 +1119,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -971,7 +1135,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1041,11 +1205,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); @@ -1129,7 +1293,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a0c9d99196..d8701be170 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ 
use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, Instrument}; +use tracing::{error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -14,81 +14,14 @@ use crate::{ deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ - debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, - remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, }, }; use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. -async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - // Prevent new uploads from starting. 
- if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -282,7 +215,14 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? 
+ }); set_deleted_in_remote_index(&timeline).await?; @@ -482,6 +422,10 @@ impl DeleteTimelineFlow { pub(crate) fn is_finished(&self) -> bool { matches!(self, Self::Finished) } + + pub(crate) fn is_not_started(&self) -> bool { + matches!(self, Self::NotStarted) + } } struct DeletionGuard(OwnedMutexGuard); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs new file mode 100644 index 0000000000..69b82344a6 --- /dev/null +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -0,0 +1,540 @@ +use std::sync::Arc; + +use super::{layer_manager::LayerManager, Timeline}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::{ + storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + Tenant, + }, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Error { + #[error("no ancestors")] + NoAncestor, + #[error("too many ancestors")] + TooManyAncestors, + #[error("shutting down, please retry later")] + ShuttingDown, + #[error("flushing failed")] + FlushAncestor(#[source] anyhow::Error), + #[error("layer download failed")] + RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + #[error("copying LSN prefix locally failed")] + CopyDeltaPrefix(#[source] anyhow::Error), + #[error("upload rewritten layer")] + UploadRewritten(#[source] anyhow::Error), + + #[error("ancestor is already being detached by: {}", .0)] + OtherTimelineDetachOngoing(TimelineId), + + #[error("remote copying layer failed")] + CopyFailed(#[source] anyhow::Error), + + #[error("unexpected error")] + Unexpected(#[source] anyhow::Error), +} + +pub(crate) struct PreparedTimelineDetach { + layers: Vec, +} + +/// TODO: this should be part of PageserverConf because we cannot easily modify cplane 
arguments. +#[derive(Debug)] +pub(crate) struct Options { + pub(crate) rewrite_concurrency: std::num::NonZeroUsize, + pub(crate) copy_concurrency: std::num::NonZeroUsize, +} + +impl Default for Options { + fn default() -> Self { + Self { + rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + } + } +} + +/// See [`Timeline::prepare_to_detach_from_ancestor`] +pub(super) async fn prepare( + detached: &Arc, + tenant: &Tenant, + options: Options, + ctx: &RequestContext, +) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { + use Error::*; + + if detached.remote_client.as_ref().is_none() { + unimplemented!("no new code for running without remote storage"); + } + + let Some((ancestor, ancestor_lsn)) = detached + .ancestor_timeline + .as_ref() + .map(|tl| (tl.clone(), detached.ancestor_lsn)) + else { + return Err(NoAncestor); + }; + + if !ancestor_lsn.is_valid() { + return Err(NoAncestor); + } + + if ancestor.ancestor_timeline.is_some() { + // non-technical requirement; we could flatten N ancestors just as easily but we chose + // not to + return Err(TooManyAncestors); + } + + // before we acquire the gate, we must mark the ancestor as having a detach operation + // ongoing which will block other concurrent detach operations so we don't get to ackward + // situations where there would be two branches trying to reparent earlier branches. 
+ let (guard, barrier) = completion::channel(); + + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + } + *guard = Some((detached.timeline_id, barrier)); + } + + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { + let span = + tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); + async { + let started_at = std::time::Instant::now(); + let freeze_and_flush = ancestor.freeze_and_flush0(); + let mut freeze_and_flush = std::pin::pin!(freeze_and_flush); + + let res = + tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush) + .await; + + let res = match res { + Ok(res) => res, + Err(_elapsed) => { + tracing::info!("freezing and flushing ancestor is still ongoing"); + freeze_and_flush.await + } + }; + + res.map_err(FlushAncestor)?; + + // we do not need to wait for uploads to complete but we do need `struct Layer`, + // copying delta prefix is unsupported currently for `InMemoryLayer`. + tracing::info!( + elapsed_ms = started_at.elapsed().as_millis(), + "froze and flushed the ancestor" + ); + Ok(()) + } + .instrument(span) + .await?; + } + + let end_lsn = ancestor_lsn + 1; + + let (filtered_layers, straddling_branchpoint, rest_of_historic) = { + // we do not need to start from our layers, because they can only be layers that come + // *after* ancestor_lsn + let layers = tokio::select! { + guard = ancestor.layers.read() => guard, + _ = detached.cancel.cancelled() => { + return Err(ShuttingDown); + } + _ = ancestor.cancel.cancelled() => { + return Err(ShuttingDown); + } + }; + + // between retries, these can change if compaction or gc ran in between. this will mean + // we have to redo work. 
+ partition_work(ancestor_lsn, &layers) + }; + + // TODO: layers are already sorted by something: use that to determine how much of remote + // copies are already done. + tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); + + // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after + let mut new_layers: Vec = + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + + { + tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + + let mut tasks = tokio::task::JoinSet::new(); + + let mut wrote_any = false; + + let limiter = Arc::new(tokio::sync::Semaphore::new( + options.rewrite_concurrency.get(), + )); + + for layer in straddling_branchpoint { + let limiter = limiter.clone(); + let timeline = detached.clone(); + let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); + + tasks.spawn(async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + Ok(copied) + }); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(Some(copied))) => { + wrote_any = true; + tracing::info!(layer=%copied, "rewrote and uploaded"); + new_layers.push(copied); + } + Ok(Ok(None)) => {} + Ok(Err(e)) => return Err(e), + Err(je) => return Err(Unexpected(je.into())), + } + } + + // FIXME: the fsync should be mandatory, after both rewrites and copies + if wrote_any { + let timeline_dir = VirtualFile::open( + &detached + .conf + .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + } + + let mut tasks = tokio::task::JoinSet::new(); + let limiter = 
Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + + for adopted in rest_of_historic { + let limiter = limiter.clone(); + let timeline = detached.clone(); + + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let owned = + remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?; + tracing::info!(layer=%owned, "remote copied"); + Ok(owned) + } + .in_current_span(), + ); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(owned)) => { + new_layers.push(owned); + } + Ok(Err(failed)) => { + return Err(failed); + } + Err(je) => return Err(Unexpected(je.into())), + } + } + + // TODO: fsync directory again if we hardlinked something + + let prepared = PreparedTimelineDetach { layers: new_layers }; + + Ok((guard, prepared)) +} + +fn partition_work( + ancestor_lsn: Lsn, + source_layermap: &LayerManager, +) -> (usize, Vec, Vec) { + let mut straddling_branchpoint = vec![]; + let mut rest_of_historic = vec![]; + + let mut later_by_lsn = 0; + + for desc in source_layermap.layer_map().iter_historic_layers() { + // off by one chances here: + // - start is inclusive + // - end is exclusive + if desc.lsn_range.start > ancestor_lsn { + later_by_lsn += 1; + continue; + } + + let target = if desc.lsn_range.start <= ancestor_lsn + && desc.lsn_range.end > ancestor_lsn + && desc.is_delta + { + // TODO: image layer at Lsn optimization + &mut straddling_branchpoint + } else { + &mut rest_of_historic + }; + + target.push(source_layermap.get_from_desc(&desc)); + } + + (later_by_lsn, straddling_branchpoint, rest_of_historic) +} + +async fn upload_rewritten_layer( + end_lsn: Lsn, + layer: &Layer, + target: &Arc, + cancel: &CancellationToken, + ctx: &RequestContext, +) -> Result, Error> { + use Error::UploadRewritten; + let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; + + let Some(copied) = copied else { + return Ok(None); + }; + + // FIXME: better shuttingdown error + target + 
.remote_client + .as_ref() + .unwrap() + .upload_layer_file(&copied, cancel) + .await + .map_err(UploadRewritten)?; + + Ok(Some(copied.into())) +} + +async fn copy_lsn_prefix( + end_lsn: Lsn, + layer: &Layer, + target_timeline: &Arc, + ctx: &RequestContext, +) -> Result, Error> { + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + + tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); + + let mut writer = DeltaLayerWriter::new( + target_timeline.conf, + target_timeline.timeline_id, + target_timeline.tenant_shard_id, + layer.layer_desc().key_range.start, + layer.layer_desc().lsn_range.start..end_lsn, + ) + .await + .map_err(CopyDeltaPrefix)?; + + let resident = layer + .download_and_keep_resident() + .await + // likely shutdown + .map_err(RewrittenDeltaDownloadFailed)?; + + let records = resident + .copy_delta_prefix(&mut writer, end_lsn, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + drop(resident); + + tracing::debug!(%layer, records, "copied records"); + + if records == 0 { + drop(writer); + // TODO: we might want to store an empty marker in remote storage for this + // layer so that we will not needlessly walk `layer` on repeated attempts. + Ok(None) + } else { + // reuse the key instead of adding more holes between layers by using the real + // highest key in the layer. + let reused_highest_key = layer.layer_desc().key_range.end; + let copied = writer + .finish(reused_highest_key, target_timeline, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + tracing::debug!(%layer, %copied, "new layer produced"); + + Ok(Some(copied)) + } +} + +/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote +/// storage on successful return without the adopted layer being added to `index_part.json`. 
+async fn remote_copy( + adopted: &Layer, + adoptee: &Arc, + generation: Generation, + cancel: &CancellationToken, +) -> Result { + use Error::CopyFailed; + + // depending if Layer::keep_resident we could hardlink + + let mut metadata = adopted.metadata(); + debug_assert!(metadata.generation <= generation); + metadata.generation = generation; + + let owned = crate::tenant::storage_layer::Layer::for_evicted( + adoptee.conf, + adoptee, + adopted.layer_desc().layer_name(), + metadata, + ); + + // FIXME: better shuttingdown error + adoptee + .remote_client + .as_ref() + .unwrap() + .copy_timeline_layer(adopted, &owned, cancel) + .await + .map(move |()| owned) + .map_err(CopyFailed) +} + +/// See [`Timeline::complete_detaching_timeline_ancestor`]. +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + prepared: PreparedTimelineDetach, + _ctx: &RequestContext, +) -> Result, anyhow::Error> { + let rtc = detached + .remote_client + .as_ref() + .expect("has to have a remote timeline client for timeline ancestor detach"); + + let PreparedTimelineDetach { layers } = prepared; + + let ancestor = detached + .get_ancestor_timeline() + .expect("must still have a ancestor"); + let ancestor_lsn = detached.get_ancestor_lsn(); + + // publish the prepared layers before we reparent any of the timelines, so that on restart + // reparented timelines find layers. also do the actual detaching. + // + // if we crash after this operation, we will at least come up having detached a timeline, but + // we cannot go back and reparent the timelines which would had been reparented in normal + // execution. + // + // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart + // which could give us a completely wrong layer combination. 
+ rtc.schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; + + let mut tasks = tokio::task::JoinSet::new(); + + // because we are now keeping the slot in progress, it is unlikely that there will be any + // timeline deletions during this time. if we raced one, then we'll just ignore it. + tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + if !tl.is_active() { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl.clone()) + } else { + None + } + }) + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + + tasks.spawn( + async move { + let res = timeline + .remote_client + .as_ref() + .expect("reparented has to have remote client because detached has one") + .schedule_reparenting_and_wait(&new_parent) + .await; + + match res { + Ok(()) => Some(timeline), + Err(e) => { + // with the use of tenant slot, we no longer expect these. + tracing::warn!("reparenting failed: {e:#}"); + None + } + } + } + .instrument(span), + ); + }); + + let reparenting_candidates = tasks.len(); + let mut reparented = Vec::with_capacity(tasks.len()); + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Some(timeline)) => { + tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); + reparented.push(timeline.timeline_id); + } + Ok(None) => { + // lets just ignore this for now. one or all reparented timelines could had + // started deletion, and that is fine. 
+ } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // ignore; it's better to continue with a single reparenting failing (or even + // all of them) in order to get to the goal state. + // + // these timelines will never be reparentable, but they can be always detached as + // separate tree roots. + } + Err(je) => tracing::error!("unexpected join error: {je:?}"), + } + } + + if reparenting_candidates != reparented.len() { + tracing::info!("failed to reparent some candidates"); + } + + Ok(reparented) +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd769d4121..3567761b9a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -51,6 +51,7 @@ pub struct EvictionTaskTenantState { impl Timeline { pub(super) fn launch_eviction_task( self: &Arc, + parent: Arc, background_tasks_can_start: Option<&completion::Barrier>, ) { let self_clone = Arc::clone(self); @@ -66,20 +67,19 @@ impl Timeline { ), false, async move { - let cancel = task_mgr::shutdown_token(); tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()); } + _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(cancel).await; + self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc) { use crate::tenant::tasks::random_init_delay; // acquire the gate guard only once within a useful span @@ -94,7 +94,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &cancel).await.is_err() { + if random_init_delay(period, &self.cancel).await.is_err() { return; } } @@ -103,13 +103,13 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self - .eviction_iteration(&policy, &cancel, &guard, &ctx) + .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) .await; match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { - if tokio::time::timeout_at(sleep_until, cancel.cancelled()) + if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { @@ -123,6 +123,7 @@ impl Timeline { #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] async fn eviction_iteration( self: &Arc, + tenant: &Tenant, policy: &EvictionPolicy, cancel: &CancellationToken, gate: &GateGuard, @@ -137,7 +138,7 @@ impl Timeline { } EvictionPolicy::LayerAccessThreshold(p) => { match self - .eviction_iteration_threshold(p, cancel, gate, ctx) + .eviction_iteration_threshold(tenant, p, cancel, gate, ctx) .await { ControlFlow::Break(()) => return ControlFlow::Break(()), @@ -146,7 +147,11 @@ impl Timeline { (p.period, p.threshold) } EvictionPolicy::OnlyImitiate(p) 
=> { - if self.imitiate_only(p, cancel, gate, ctx).await.is_break() { + if self + .imitiate_only(tenant, p, cancel, gate, ctx) + .await + .is_break() + { return ControlFlow::Break(()); } (p.period, p.threshold) @@ -175,6 +180,7 @@ impl Timeline { async fn eviction_iteration_threshold( self: &Arc, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -182,21 +188,10 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - match self.imitate_layer_accesses(p, cancel, gate, ctx).await { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; #[derive(Debug, Default)] struct EvictionStats { @@ -315,23 +310,33 @@ impl Timeline { /// disk usage based eviction task. async fn imitiate_only( self: &Arc, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, ); - let _permit = tokio::select! 
{ - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - self.imitate_layer_accesses(p, cancel, gate, ctx).await + tokio::select! { + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } } /// If we evict layers but keep cached values derived from those layers, then @@ -361,12 +366,14 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_layer_accesses( &self, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. @@ -396,17 +403,32 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { - Ok(t) => t, - Err(_) => { - return ControlFlow::Break(()); + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! 
{ + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) } }; - let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx) + self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx) .await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()); } @@ -480,7 +502,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, - tenant: &Arc, + tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext, ) { diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9..66aa765015 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,13 +6,13 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, + storage_layer::LayerName, Generation, }, METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::ShardIndex; use std::{collections::HashMap, str::FromStr}; use utils::lsn::Lsn; @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, u64), + Layer(LayerName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -43,10 +43,10 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, file_size) + Discovered::Layer(file_name, direntry.path().to_owned(), file_size) } Err(_) => { if file_name == METADATA_FILE_NAME { @@ -72,6 +72,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + metadata: LayerFileMetadata::new(file_size, generation, shard), + } + } +} + /// Decision on what to do with a layer file after considering its local and remote metadata. #[derive(Clone, Debug)] pub(super) enum Decision { @@ -80,11 +102,11 @@ pub(super) enum Decision { /// The layer is present locally, but local metadata does not match remote; we must /// delete it and treat it as evicted. UseRemote { - local: LayerFileMetadata, + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), + UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,39 +114,42 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. 
- LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerFileName, u64)>, + discovered: Vec<(LayerName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<(LayerFileName, Result)> { +) -> Vec<(LayerName, Result)> { use Decision::*; - // name => (local, remote) - type Collected = HashMap, Option)>; + // name => (local_metadata, remote_metadata) + type Collected = + HashMap, Option)>; let mut discovered = discovered .into_iter() - .map(|(name, file_size)| { + .map(|(layer_name, local_path, file_size)| { ( - name, + layer_name, // The generation and shard here will be corrected to match IndexPart in the merge below, unless // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. ( - Some(LayerFileMetadata::new(file_size, generation, shard)), + Some(LocalLayerFileMetadata::new( + local_path, file_size, generation, shard, + )), None, ), ) @@ -153,7 +178,7 @@ pub(super) fn reconcile( Err(DismissedLayer::Future { local }) } else { match (local, remote) { - (Some(local), Some(remote)) if local != remote => { + (Some(local), Some(remote)) if local.metadata != remote => { Ok(UseRemote { local, remote }) } (Some(x), Some(_)) => Ok(UseLocal(x)), @@ -177,12 +202,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { } pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, + local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.file_size(); + let local_size = local.metadata.file_size(); let remote_size = remote.file_size(); + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!("removing local file 
{file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); @@ -199,7 +224,7 @@ pub(super) fn cleanup_local_file_for_remote( pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -211,12 +236,14 @@ pub(super) fn cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name}, metadata {:?}", + local.metadata + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index d54dc1642c..a72eb1b3bf 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -120,9 +120,10 @@ impl LayerManager { /// Called from `freeze_inmem_layer`, returns true if successfully frozen. pub(crate) async fn try_freeze_in_memory_layer( &mut self, - Lsn(last_record_lsn): Lsn, + lsn: Lsn, last_freeze_at: &AtomicLsn, ) { + let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); if let Some(open_layer) = &self.layer_map.open_layer { @@ -135,8 +136,11 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - last_freeze_at.store(end_lsn); } + + // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this + // accounts for regions in the LSN range where we might have ingested no data due to sharding. 
+ last_freeze_at.store(end_lsn); } /// Add image layers to the layer map, called from `create_image_layers`. @@ -201,6 +205,24 @@ impl LayerManager { updates.flush(); } + /// Called when compaction is completed. + pub(crate) fn rewrite_layers( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + _metrics: &TimelineMetrics, + ) { + let mut updates = self.layer_map.batch_update(); + + // TODO: implement rewrites (currently this code path only used for drops) + assert!(rewrite_layers.is_empty()); + + for l in drop_layers { + Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + /// Called when garbage collect has selected the layers to be removed. pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) { let mut updates = self.layer_map.batch_update(); @@ -272,7 +294,7 @@ impl LayerFileManager { // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index e1034a9fe2..2b60e670ea 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -86,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, + tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, @@ -114,7 +115,7 @@ impl<'t> UninitializedTimeline<'t> { // All the data has been imported. 
Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; - tl.activate(broker_client, None, ctx); + tl.activate(tenant, broker_client, None, ctx); Ok(tl) } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 2fab6722b8..a085154a5a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,26 +24,21 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; -use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; -use tokio::select; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -62,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
+ cancel: CancellationToken, } impl WalReceiver { @@ -78,65 +74,58 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); - loop { - select! 
{ - _ = task_mgr::shutdown_watcher() => { - trace!("WAL receiver shutdown requested, shutting down"); + while !cancel.is_cancelled() { + let loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + &cancel, + &loop_status, + ).await; + match loop_step_result { + Ok(()) => continue, + Err(_cancelled) => { + trace!("Connection manager loop ended, shutting down"); break; - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut connection_manager_state, - &walreceiver_ctx, - &loop_status, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - trace!("Connection manager loop ended, shutting down"); - break; - } - }, + } } } - connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -170,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. 
fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); @@ -197,6 +190,9 @@ impl TaskHandle { } } + /// # Cancel-Safety + /// + /// Cancellation-safe. async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index cf6dee114f..991e4ac045 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -17,17 +17,19 @@ use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; -use crate::task_mgr::{shutdown_token, TaskKind}; +use crate::task_mgr::TaskKind; use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio::select; +use 
tokio_util::sync::CancellationToken; use tracing::*; use postgres_connection::PgConnectionConfig; @@ -45,27 +47,33 @@ use super::{ TaskEvent, TaskHandle, }; +pub(crate) struct Cancelled; + /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. +/// +/// # Cancel-Safety +/// +/// Not cancellation-safe. Use `cancel` token to request cancellation. pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + cancel: &CancellationToken, manager_status: &std::sync::RwLock>, -) -> ControlFlow<(), ()> { - match connection_manager_state - .timeline - .wait_to_become_active(ctx) - .await - { +) -> Result<(), Cancelled> { + match tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); }, + st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st } + } { Ok(()) => {} Err(new_state) => { debug!( ?new_state, "state changed, stopping wal connection manager loop" ); - return ControlFlow::Break(()); + return Err(Cancelled); } } @@ -83,17 +91,28 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
- let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -101,7 +120,12 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently - select! { + // - if there's no connection and no candidates, try to send a discovery request + + // NB: make sure each of the select expressions are cancellation-safe + // (no need for arms to be cancellation-safe). + tokio::select! 
{ + _ = cancel.cancelled() => { return Err(Cancelled); } Some(wal_connection_update) = async { match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -133,7 +157,7 @@ pub(super) async fn connection_manager_loop_step( }, // Got a new update from the broker - broker_update = broker_subscription.message() => { + broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => { match broker_update { Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { @@ -147,16 +171,17 @@ pub(super) async fn connection_manager_loop_step( warn!("broker subscription failed: {status}"); } } - return ControlFlow::Continue(()); + return Ok(()); } Ok(None) => { error!("broker subscription stream ended"); // can't happen - return ControlFlow::Continue(()); + return Ok(()); } } }, new_event = async { + // Reminder: this match arm needs to be cancellation-safe. loop { if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager should only be launched after timeline has become active"); @@ -182,11 +207,11 @@ pub(super) async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(()) => { - return ControlFlow::Continue(()); + return Ok(()); } ControlFlow::Break(()) => { debug!("Timeline is no longer active, stopping wal connection manager loop"); - return ControlFlow::Break(()); + return Err(Cancelled); } }, @@ -202,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. 
+ + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. + return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. + let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. 
+ + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -218,32 +302,46 @@ pub(super) async fn connection_manager_loop_step( async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> Streaming { + cancel: &CancellationToken, +) -> Result, Cancelled> { let mut attempt = 0; - let cancel = shutdown_token(); - loop { exponential_backoff( attempt, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel, + cancel, ) .await; attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; - match broker_client.subscribe_safekeeper_info(request).await { + match { + tokio::select! 
{ + r = broker_client.subscribe_by_filter(request) => { r } + _ = cancel.cancelled() => { return Err(Cancelled); } + } + } { Ok(resp) => { - return resp.into_inner(); + return Ok(resp.into_inner()); } Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and @@ -264,6 +362,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, @@ -380,13 +480,17 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. #[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -394,6 +498,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -401,6 +506,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. 
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -419,7 +540,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -447,6 +568,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -486,6 +613,10 @@ impl ConnectionManagerState { /// Drops the current connection (if any) and updates retry timeout for the next /// connection attempt to the same safekeeper. + /// + /// # Cancel-Safety + /// + /// Not cancellation-safe. async fn drop_old_connection(&mut self, needs_shutdown: bool) { let wal_connection = match self.wal_connection.take() { Some(wal_connection) => wal_connection, @@ -493,7 +624,14 @@ impl ConnectionManagerState { }; if needs_shutdown { - wal_connection.connection_task.shutdown().await; + wal_connection + .connection_task + .shutdown() + // This here is why this function isn't cancellation-safe. + // If we got cancelled here, then self.wal_connection is already None and we lose track of the task. + // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None + // and thus be ineffective. + .await; } let retry = self @@ -550,7 +688,41 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. 
- fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); @@ -563,7 +735,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -762,7 +938,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -772,7 +948,7 @@ impl ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. 
fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -838,6 +1014,9 @@ impl ConnectionManagerState { } } + /// # Cancel-Safety + /// + /// Not cancellation-safe. pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -909,19 +1088,11 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, }, latest_update, @@ -986,7 +1157,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1154,7 +1325,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1221,7 +1392,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1285,7 +1456,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async 
move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1341,6 +1512,7 @@ mod tests { timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1363,7 +1535,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1384,7 +1556,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1396,7 +1568,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. 
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d9f780cfd1..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,7 +27,6 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -37,8 +36,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. 
+ let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. 
// TODO: technically, this task outlives the surrounding function, so, the @@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { @@ -389,17 +400,6 @@ pub(super) async fn handle_walreceiver_connection( } } - { - // This is a hack. It piggybacks on the keepalive messages sent by the - // safekeeper in order to enforce `checkpoint_timeout` on the currently - // open layer. This hack doesn't provide a bound on the total size of - // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. - let mut writer = timeline.writer().await; - if let Err(err) = writer.tick().await { - warn!("Timeline writer tick failed: {err}"); - } - } - if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. 
- let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index a5516bb9a9..a2f761fa94 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,8 +1,9 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -45,7 +46,7 @@ pub(crate) struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - pub(crate) latest_files: HashMap, + pub(crate) latest_files: HashMap, /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? @@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized { /// DANGER: do not return to outside world, e.g., safekeepers. pub(crate) latest_metadata: TimelineMetadata, + /// Part of the flattened "next" `index_part.json`. + pub(crate) latest_lineage: Lineage, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. @@ -89,7 +93,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. 
#[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -121,11 +125,16 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStopped { +pub(super) struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } +pub(super) enum UploadQueueStopped { + Deletable(UploadQueueStoppedDeletable), + Uninitialized, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum NotInitialized { #[error("queue is in state Uninitialized")] @@ -166,6 +175,7 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), + latest_lineage: Lineage::default(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations @@ -213,6 +223,7 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), + latest_lineage: index_part.lineage.clone(), projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), @@ -249,12 +260,15 @@ impl UploadQueue { } } - pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> { + pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> { match self { UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { anyhow::bail!("queue is in state {}", self.as_str()) } - UploadQueue::Stopped(stopped) => Ok(stopped), + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => { + anyhow::bail!("queue is in state Stopped(Uninitialized)") + } + 
UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable), } } } @@ -273,7 +287,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] @@ -282,7 +296,7 @@ pub(crate) enum UploadOp { UploadLayer(ResidentLayer, LayerFileMetadata), /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + UploadMetadata(Box, Lsn), /// Delete layer files Delete(Delete), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 805f70b23b..91934d5e0e 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,18 +61,18 @@ pub struct VectoredRead { } impl VectoredRead { - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } } #[derive(Eq, PartialEq)] -enum VectoredReadExtended { +pub(crate) enum VectoredReadExtended { Yes, No, } -struct VectoredReadBuilder { +pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, @@ -80,7 +80,17 @@ struct VectoredReadBuilder { } impl VectoredReadBuilder { - fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. 
+ pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) @@ -97,7 +107,8 @@ impl VectoredReadBuilder { /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size - fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; if self.end == start && self.size() + size <= self.max_read_size { self.end = end; @@ -111,11 +122,11 @@ impl VectoredReadBuilder { VectoredReadExtended::No } - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } - fn build(self) -> VectoredRead { + pub(crate) fn build(self) -> VectoredRead { VectoredRead { start: self.start, end: self.end, diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 830c9897ca..e6c835aa75 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result 0 { + statvfs.fragment_size() + } else { + statvfs.block_size() + }; #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] let free = statvfs.blocks_available() as u64 * blocksz; - let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let used = statvfs + .blocks() + // use blocks_free instead of available here to match df in case someone compares + .saturating_sub(statvfs.blocks_free()) as u64 + * blocksz; + let captured_at = std::time::SystemTime::now(); let doc = PageserverUtilization { @@ -29,7 +41,7 @@ pub(crate) fn 
regenerate(tenants_path: &Path) -> anyhow::Result (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -782,7 +787,7 @@ where } } // NB: don't use `buf.is_empty()` here; it is from the - // `impl Deref for Slice { Target = [u8] }`; the the &[u8] + // `impl Deref for Slice { Target = [u8] }`; the &[u8] // returned by it only covers the initialized portion of `buf`. // Whereas we're interested in ensuring that we filled the entire // buffer that the user passed in. 
@@ -1083,6 +1088,18 @@ impl Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); @@ -1135,6 +1152,9 @@ fn get_open_files() -> &'static OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1166,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1190,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1264,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1277,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1286,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, 
OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1328,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 7505b7487e..55b1d0b46b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,33 +1,46 @@ -use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; -pub struct Writer { - dst: VirtualFile, +pub struct Writer { + dst: W, bytes_amount: u64, } -impl Writer { - pub fn new(dst: VirtualFile) -> Self { +impl Writer { + pub fn new(dst: W) -> Self { Self { dst, bytes_amount: 0, } } + + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. 
- pub fn into_inner(self) -> (u64, VirtualFile) { + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } } -impl OwnedAsyncWriter for Writer { +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ #[inline(always)] async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = self.dst.write_all(buf).await; - let nwritten = res?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f1812d9b51..885a9221c5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,23 +1,26 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } -/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers -/// into `BUFFER_SIZE`-sized writes. +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. /// /// # Passthrough Of Large Writers /// -/// Buffered writes larger than the `BUFFER_SIZE` cause the internal -/// buffer to be flushed, even if it is not full yet. Then, the large -/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`]. 
+/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. /// /// This pass-through is generally beneficial for throughput, but if /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, @@ -25,93 +28,194 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: W, - // invariant: always remains Some(buf) - // with buf.capacity() == BUFFER_SIZE except - // - while IO is ongoing => goes back to Some() once the IO completed successfully - // - after an IO error => stays `None` forever - // In these exceptional cases, it's `None`. - buf: Option, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. 
+ buf: Option, } -impl BufferedWriter +impl BufferedWriter where + B: Buffer + Send, + Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W) -> Self { + pub fn new(writer: W, buf: B) -> Self { Self { writer, - buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + buf: Some(buf), } } - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) } - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> - where - B: IoBuf + Send, - { + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> { + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= BUFFER_SIZE { - self.flush().await?; + if chunk.len() >= self.buf().cap() { + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf .as_ref() .expect("must not use after an error") - .len(), + .pending(), 0 ); - let chunk_len = chunk.len(); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); - drop(chunk); - return Ok(()); + return Ok((nwritten, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < BUFFER_SIZE); - let mut chunk = &chunk[..]; + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; 
+ while !slice.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = slice.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) + } + + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); - let need = BUFFER_SIZE - buf.len(); + let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); buf.extend_from_slice(&chunk[..n]); chunk = &chunk[n..]; - if buf.len() >= BUFFER_SIZE { - assert_eq!(buf.len(), BUFFER_SIZE); - self.flush().await?; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; } } - assert!(chunk.is_empty(), "by now we should have drained the chunk"); - Ok(()) + Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); - if buf.is_empty() { + let buf_len = buf.pending(); + if buf_len == 0 { self.buf = Some(buf); - return std::io::Result::Ok(()); + return Ok(()); } - let buf_len = buf.len(); - let (nwritten, mut buf) = self.writer.write_all(buf).await?; + let 
(nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); - buf.clear(); - self.buf = Some(buf); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) } } +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -125,7 +229,11 @@ impl OwnedAsyncWriter for Vec { #[cfg(test)] mod tests { + use bytes::BytesMut; + use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -135,6 +243,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -147,10 +256,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + macro_rules! 
write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -158,13 +271,13 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -175,12 +288,12 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -191,16 +304,45 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), 
Vec::from(b"bc"), Vec::from(b"de")] ); Ok(()) } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; + + let recorder = writer.flush_and_into_inner(ctx).await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..79f075b877 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. 
self.observe_decoded_block(modification, blk, ctx).await?; @@ -1034,7 +1034,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1068,13 +1068,7 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1242,7 +1236,7 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1541,7 +1535,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1553,7 +1547,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; @@ -1650,14 +1644,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1732,29 +1726,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1762,46 +1756,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, 
Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1817,19 +1811,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 1 at 4") ); @@ -1837,13 +1831,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1856,7 +1850,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1869,19 +1863,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - 
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, test_img("foo blk 1") ); @@ -1894,21 +1888,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img("foo blk 1500") ); @@ -1935,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1954,7 +1948,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1972,13 +1966,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2011,24 +2005,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2039,7 +2033,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, test_img(&data) ); @@ -2056,7 +2050,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2066,7 +2060,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img(&data) ); @@ -2075,7 +2069,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2084,7 +2078,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img(&data) ); @@ -2104,13 +2098,13 @@ mod tests { 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2120,7 +2114,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img(&data) ); @@ -2154,7 +2148,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2168,7 +2162,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2183,7 +2177,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2201,7 +2195,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ae2d996879..02f6f49694 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -55,6 +55,7 @@ impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0004f4f3c9..9776d4ce88 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,6 +20,7 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; +pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,13 +35,14 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; -use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::heavier_once_cell; /// /// This is the real implementation that uses a Postgres process to @@ -53,7 +55,19 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, + /// The current [`process::Process`] that is used by new redo requests. + /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. 
+ /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + redo_process: heavier_once_cell::OnceCell>, } /// @@ -101,6 +115,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -121,11 +136,12 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -134,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -152,7 +174,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -164,8 +186,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -174,8 +195,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. 
#[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -191,40 +215,24 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM - .observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } - } - Some(proc) => Arc::clone(proc), + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } }; @@ -233,6 +241,7 @@ impl PostgresRedoManager { // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); @@ 
-272,34 +281,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. 
By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
- let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! 
set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. 
- // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. 
- let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. 
- - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will 
be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, 
AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. 
Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. 
+ // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. 
Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); 
+ + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, 
nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. 
close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
+ anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. 
+ let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + 
.take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 0bcb9545a6..cd316dbb91 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -14,7 +14,8 @@ OBJS = \ relsize_cache.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o + control_plane_connector.o \ + walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e31de3c6b5..f5ce2caff3 100644 
--- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,6 +49,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; +int neon_protocol_version = 2; + static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -111,6 +113,7 @@ static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); static void pageserver_disconnect(shardno_t shard_no); +static void pageserver_disconnect_shard(shardno_t shard_no); static bool PagestoreShmemIsValid(void) @@ -378,7 +381,17 @@ pageserver_connect(shardno_t shard_no, int elevel) pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + switch (neon_protocol_version) + { + case 2: + query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } ret = PQsendQuery(conn, query); pfree(query); if (ret != 1) @@ -439,7 +452,7 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; @@ -487,9 +500,32 @@ retry: return ret; } - +/* + * Reset prefetch and drop connection to the shard. + * It also drops connection to all other shards involved in prefetch. + */ static void pageserver_disconnect(shardno_t shard_no) +{ + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. 
+ * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + + pageserver_disconnect_shard(shard_no); +} + +/* + * Disconnect from specified shard + */ +static void +pageserver_disconnect_shard(shardno_t shard_no) { /* * If anything goes wrong while we were sending a request, it's not clear @@ -503,14 +539,6 @@ pageserver_disconnect(shardno_t shard_no) neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); PQfinish(page_servers[shard_no].conn); page_servers[shard_no].conn = NULL; - - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); } if (page_servers[shard_no].wes != NULL) { @@ -676,7 +704,8 @@ page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, - .receive = pageserver_receive + .receive = pageserver_receive, + .disconnect = pageserver_disconnect_shard }; static bool @@ -827,6 +856,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 2, /* use protocol version 2 */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 6ede78a576..b69a3819c9 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -34,6 +34,7 @@ #include "walproposer.h" #include "pagestore_client.h" #include "control_plane_connector.h" +#include "walsender_hooks.h" PG_MODULE_MAGIC; void _PG_init(void); @@ -265,7 +266,6 @@ LogicalSlotsMonitorMain(Datum main_arg) } } - void 
_PG_init(void) { @@ -279,6 +279,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); @@ -312,7 +313,7 @@ pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; - size = GetZenithCurrentClusterSize(); + size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a0f8c97497..5c653fc6c6 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); extern uint64 BackpressureThrottlingTime(void); +extern void SetNeonCurrentClusterSize(uint64 size); +extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f7ec9e5bfa..e43f4d9d96 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -36,10 +36,7 @@ static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); -static void NeonWALReaderResetRemote(NeonWALReader *state); static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); -static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); -static void neon_wal_segment_close(NeonWALReader *state); static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli); @@ -82,8 +79,9 @@ struct NeonWALReader XLogRecPtr req_lsn; Size req_len; Size req_progress; - WalProposer *wp; /* we learn donor through walproposer */ + char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ + XLogRecPtr 
donor_lsn; /* state of connection to safekeeper */ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; @@ -107,7 +105,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) { NeonWALReader *reader; @@ -123,8 +121,6 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalPropose reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; - reader->wp = wp; - reader->rem_state = RS_NONE; if (log_prefix) @@ -204,21 +200,16 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { if (state->rem_state == RS_NONE) { - XLogRecPtr donor_lsn; - - /* no connection yet; start one */ - Safekeeper *donor = GetDonor(state->wp, &donor_lsn); - - if (donor == NULL) + if (!NeonWALReaderUpdateDonor(state)) { snprintf(state->err_msg, sizeof(state->err_msg), "failed to establish remote connection to fetch WAL: no donor available"); return NEON_WALREAD_ERROR; + } - snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); - nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", - state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); - state->wp_conn = libpqwp_connect_start(donor->conninfo); + /* no connection yet; start one */ + nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn)); + state->wp_conn = libpqwp_connect_start(state->donor_conninfo); if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) { snprintf(state->err_msg, sizeof(state->err_msg), @@ -251,10 +242,22 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { /* connection successfully established */ char start_repl_query[128]; + term_t term = 
pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm); + /* + * Set elected walproposer's term to pull only data from + * its history. Note: for logical walsender it means we + * might stream WAL not yet committed by safekeepers. It + * would be cleaner to fix this. + * + * mineLastElectedTerm shouldn't be 0 at this point + * because we checked above that donor exists and it + * appears only after successfull election. + */ + Assert(term > 0); snprintf(start_repl_query, sizeof(start_repl_query), "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", - LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + LSN_FORMAT_ARGS(startptr), term); nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", state->donor_name, start_repl_query); if (!libpqwp_send_query(state->wp_conn, start_repl_query)) @@ -404,6 +407,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + return NEON_WALREAD_SUCCESS; } } @@ -526,7 +533,7 @@ err: } /* reset remote connection and request in progress */ -static void +void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; @@ -691,13 +698,25 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return true; } +XLogRecPtr +NeonWALReaderGetRemLsn(NeonWALReader *state) +{ + return state->rem_lsn; +} + +const WALOpenSegment * +NeonWALReaderGetSegment(NeonWALReader *state) +{ + return &state->seg; +} + /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. 
* * XLogReaderRoutine->segment_open callback for local pg_wal files */ -static bool +bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { @@ -724,7 +743,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) } /* copy of vanilla wal_segment_close with NeonWALReader */ -static void +void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) @@ -740,3 +759,19 @@ NeonWALReaderErrMsg(NeonWALReader *state) { return state->err_msg; } + +/* + * Returns true if there is a donor, and false otherwise + */ +bool +NeonWALReaderUpdateDonor(NeonWALReader *state) +{ + WalproposerShmemState *wps = GetWalpropShmemState(); + + SpinLockAcquire(&wps->mutex); + memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name)); + memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo)); + state->donor_lsn = wps->donor_lsn; + SpinLockRelease(&wps->mutex); + return state->donor_name[0] != '\0'; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 6be9f149aa..3e41825069 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,12 +19,19 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); extern void NeonWALReaderFree(NeonWALReader *state); +extern void NeonWALReaderResetRemote(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); +extern XLogRecPtr 
NeonWALReaderGetRemLsn(NeonWALReader *state); +extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state); +extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +extern void neon_wal_segment_close(NeonWALReader *state); +extern bool NeonWALReaderUpdateDonor(NeonWALReader *state); + #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 2889ffacae..7709ab9d42 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -69,18 +69,33 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; -/* - * supertype of all the Neon*Request structs below +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. + * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. + * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. 
*/ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -180,6 +195,7 @@ typedef struct bool (*send) (shardno_t shard_no, NeonRequest * request); NeonResponse *(*receive) (shardno_t shard_no); bool (*flush) (shardno_t shard_no); + void (*disconnect) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); @@ -192,6 +208,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -224,14 +241,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index bf3b53253e..1871431a45 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -169,8 +169,8 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { 
BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -270,19 +270,19 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); +static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -339,8 +339,8 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsn = source_slot->request_lsn; + target_slot->not_modified_since = 
source_slot->not_modified_since; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -359,7 +359,8 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsn = InvalidXLogRecPtr; + source_slot->not_modified_since = InvalidXLogRecPtr; /* update bookkeeping */ n_moved++; @@ -614,6 +615,14 @@ prefetch_on_ps_disconnect(void) Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. + */ + page_server->disconnect(slot->shard_no); + /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; @@ -634,13 +643,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -677,56 +685,39 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). 
+ */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) { bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + + if (force_request_lsn) { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; + request.req.lsn = *force_request_lsn; + request.req.not_modified_since = *force_not_modified_since; } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); - - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; + neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum, + &request.req.lsn, + &request.req.not_modified_since); } + slot->request_lsn = request.req.lsn; + slot->not_modified_since = request.req.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -743,7 +734,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -753,22 +743,25 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsn and force_not_modified_since are not NULL, those + * values are sent to the pageserver. If they are NULL, we utilize the + * lastWrittenLsn -infrastructure to fill them in. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
*/ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, + XLogRecPtr *force_not_modified_since) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -786,38 +779,19 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. */ - if (force_latest && force_lsn) + if (force_request_lsn) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsn, + *force_not_modified_since, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -880,7 +854,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -912,7 +887,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - 
prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsn, force_not_modified_since); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -941,7 +916,7 @@ page_server_request(void const *req) BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); @@ -957,11 +932,10 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. @@ -988,8 +962,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. 
We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. + */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -997,8 +1015,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1010,8 +1026,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1023,8 +1037,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1033,8 +1045,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1048,8 +1058,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); @@ -1200,7 +1208,7 @@ nm_to_string(NeonMessage 
*msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1213,7 +1221,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1227,7 +1235,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1238,7 +1246,7 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1250,7 +1258,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"kind\": %u", 
msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1522,44 +1530,38 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. - */ - *latest = false; + /* Request the page at the last replayed LSN. */ + *request_lsn = GetXLogReplayRecPtr(NULL); + *not_modified_since = last_written_lsn; + Assert(last_written_lsn <= *request_lsn); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); - - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. 
+ * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still be in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1574,16 +1576,109 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the latest version of the page. The most up-to-date request + * LSN we could use would be the current insert LSN, but to avoid the + * overhead of looking it up, use 'flushlsn' instead. This relies on + * the assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we wouldn't be + * requesting it. + */ + *request_lsn = flushlsn; + *not_modified_since = last_written_lsn; + } +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. 
+ */ +static bool +neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsn >= not_modified_since); + Assert(slot->request_lsn >= slot->not_modified_since); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitrary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + return false; } - return lsn; + /*--- + * Each request to the pageserver carries two LSN values: + * `not_modified_since` and `request_lsn`. The (not_modified_since, + * request_lsn] range of each request is effectively a claim that the page + * has not been modified between those LSNs. If the range of the old + * request in the queue overlaps with the new request, we know that the + * page hasn't been modified in the union of the ranges. 
We can use the + * response to old request to satisfy the new request in that case. For + * example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsn >= slot->not_modified_since); + + return not_modified_since <= slot->request_lsn; } /* @@ -1595,8 +1690,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -1651,12 +1746,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; @@ -1681,7 +1777,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page 
server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); } pfree(resp); return exists; @@ -1832,7 +1928,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1913,7 +2009,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -2093,10 +2189,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) #endif { NeonResponse *resp; @@ -2133,20 +2229,22 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. 
*/ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2154,7 +2252,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); @@ -2170,8 +2269,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsn, + ¬_modified_since); slot = GetPrfSlot(ring_index); } else @@ -2217,7 +2316,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2235,8 +2334,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ 
-2261,8 +2360,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, + &request_lsn, ¬_modified_since); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2431,8 +2531,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2459,12 +2559,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2490,7 +2591,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2512,16 +2613,17 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + XLogRecPtr request_lsn, + not_modified_since; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + 
neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .dbNode = dbNode, }; @@ -2545,7 +2647,7 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -2594,7 +2696,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2808,14 +2909,33 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn; - /* - * GetRedoStartLsn() returns LSN of basebackup. - * We need to download SLRU segments only once after node startup, - * then SLRUs are maintained locally. - */ - request_lsn = GetRedoStartLsn(); + XLogRecPtr request_lsn, + not_modified_since; + + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. + */ + request_lsn = GetRedoStartLsn(); + } + } + else + request_lsn = GetXLogInsertRecPtr(); request_lsn = nm_adjust_lsn(request_lsn); + + /* + * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. 
+ */ + not_modified_since = GetRedoStartLsn(); + SlruKind kind; if (STRPREFIX(path, "pg_xact")) @@ -2830,8 +2950,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, - .req.latest = false, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .kind = kind, .segno = segno @@ -2864,7 +2984,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); } pfree(resp); @@ -2959,6 +3079,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2977,14 +3100,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. 
*/ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d7987954d4..dbc67a24f5 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); - +static void UpdateDonorShmem(WalProposer *wp); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -922,7 +922,8 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; - int n_ready = 0; + int n_ready = 0; + WalproposerShmemState *walprop_shared; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -964,16 +965,18 @@ DetermineEpochStartLsn(WalProposer *wp) if (n_ready < wp->quorum) { /* - * This is a rare case that can be triggered if safekeeper has voted and disconnected. - * In this case, its state will not be SS_IDLE and its vote cannot be used, because - * we clean up `voteResponse` in `ShutdownConnection`. + * This is a rare case that can be triggered if safekeeper has voted + * and disconnected. In this case, its state will not be SS_IDLE and + * its vote cannot be used, because we clean up `voteResponse` in + * `ShutdownConnection`. */ wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); } /* - * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping - * and nothing was committed yet. Start streaming then from the basebackup LSN. 
+ * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. Start streaming then from + * the basebackup LSN. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { @@ -984,11 +987,12 @@ DetermineEpochStartLsn(WalProposer *wp) } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so it - * should never be zero at this point, if we know timelineStartLsn. - * + * Safekeepers are setting truncateLsn after timelineStartLsn is known, so + * it should never be zero at this point, if we know timelineStartLsn. + * * timelineStartLsn can be zero only on the first syncSafekeepers run. */ Assert((wp->truncateLsn != InvalidXLogRecPtr) || @@ -1022,10 +1026,9 @@ DetermineEpochStartLsn(WalProposer *wp) * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ + walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers) { - WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp); - /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. @@ -1040,7 +1043,7 @@ DetermineEpochStartLsn(WalProposer *wp) * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) + pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. 
@@ -1054,8 +1057,8 @@ DetermineEpochStartLsn(WalProposer *wp) LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } - walprop_shared->mineLastElectedTerm = wp->propTerm; } + pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* @@ -1105,9 +1108,13 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + + /* + * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline + * is created manually (test_s3_wal_replay) + */ Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else @@ -1177,6 +1184,12 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* + * Donors can only be in SS_ACTIVE state, so we potentially update the + * donor when we switch one to SS_ACTIVE. + */ + UpdateDonorShmem(sk->wp); + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(sk); } @@ -1568,17 +1581,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. 
*/ -Safekeeper * -GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +static void +UpdateDonorShmem(WalProposer *wp) { Safekeeper *donor = NULL; int i; - *donor_lsn = InvalidXLogRecPtr; + XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { - wp_log(WARNING, "GetDonor called before elections are won"); - return NULL; + wp_log(WARNING, "UpdateDonorShmem called before elections are won"); + return; } /* @@ -1589,7 +1602,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->safekeeper[wp->donor].state >= SS_IDLE) { donor = &wp->safekeeper[wp->donor]; - *donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propEpochStartLsn; } /* @@ -1601,13 +1614,19 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; - *donor_lsn = sk->appendResponse.flushLsn; + donor_lsn = sk->appendResponse.flushLsn; } } - return donor; + + if (donor == NULL) + { + wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); + return; + } + wp->api.update_donor(wp, donor, donor_lsn); } /* @@ -1617,7 +1636,7 @@ static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; - XLogRecPtr newCommitLsn; + XLogRecPtr newCommitLsn; newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); if (newCommitLsn > wp->commitLsn) @@ -1627,7 +1646,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) BroadcastAppendRequest(wp); } - /* + /* * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). * The last one will terminate the process if the shutdown is requested * and WAL is committed by the quorum. 
BroadcastAppendRequest() should be diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 28585eb4e7..41daeb87b9 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -284,13 +284,19 @@ typedef struct PageserverFeedback typedef struct WalproposerShmemState { + pg_atomic_uint64 propEpochStartLsn; + char donor_name[64]; + char donor_conninfo[MAXCONNINFO]; + XLogRecPtr donor_lsn; + slock_t mutex; - term_t mineLastElectedTerm; + pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + pg_atomic_uint64 currentClusterSize; /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; - int num_shards; + int num_shards; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -464,6 +470,9 @@ typedef struct walproposer_api /* Get pointer to the latest available WAL. */ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); + /* Update current donor info in WalProposer Shmem */ + void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); + /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); @@ -496,7 +505,7 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. - * + * * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -544,13 +553,14 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. - * + * * On timeout, events is set to WL_NO_EVENTS. On socket event, events is * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is * closed, events is set to WL_SOCKET_READABLE. - * - * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer. 
- * It can be returned only if caller asked for this event in the last *_event_set call. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the + * buffer. It can be returned only if caller asked for this event in the + * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -570,9 +580,9 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every AppendResponse from the safekeeper. Used to propagate - * backpressure feedback and to confirm WAL persistence (has been commited - * on the quorum of safekeepers). + * Called after every AppendResponse from the safekeeper. Used to + * propagate backpressure feedback and to confirm WAL persistence (has + * been committed on the quorum of safekeepers). + */ void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); @@ -715,12 +725,14 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +extern WalproposerShmemState *GetWalpropShmemState(); + /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. 
*/ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); -extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); +extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 002bf4e2ce..e5ef93b456 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -85,7 +85,6 @@ static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); -static TimeLineID walprop_pg_get_timeline_id(void); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; @@ -94,6 +93,8 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type; static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif +static void WalproposerShmemInit_SyncSafekeeper(void); + static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); @@ -136,6 +137,7 @@ WalProposerSync(int argc, char *argv[]) WalProposer *wp; init_walprop_config(true); + WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); @@ -281,13 +283,27 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } LWLockRelease(AddinShmemInitLock); return found; } +static void +WalproposerShmemInit_SyncSafekeeper(void) +{ + walprop_shared 
= palloc(WalproposerShmemSize()); + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); +} + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec static bool @@ -398,6 +414,13 @@ nwp_shmem_startup_hook(void) WalproposerShmemInit(); } +WalproposerShmemState * +GetWalpropShmemState() +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { @@ -430,14 +453,15 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback) for (int i = 0; i < walprop_shared->num_shards; i++) { PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) { if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) min_feedback.last_received_lsn = feedback->last_received_lsn; - + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; - + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; } @@ -550,6 +574,7 @@ static void walprop_sigusr2(SIGNAL_ARGS) { int save_errno = errno; + got_SIGUSR2 = true; SetLatch(MyLatch); errno = save_errno; @@ -597,7 +622,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp) return GetCurrentTimestamp(); } -static TimeLineID +TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 @@ -616,6 +641,20 @@ walprop_pg_load_libpqwalreceiver(void) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } +static void +walprop_pg_update_donor(WalProposer 
*wp, Safekeeper *donor, XLogRecPtr donor_lsn) +{ + WalproposerShmemState *wps = wp->api.get_shmem_state(wp); + char donor_name[64]; + + pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); + SpinLockAcquire(&wps->mutex); + memcpy(wps->donor_name, donor_name, sizeof(donor_name)); + memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); + wps->donor_lsn = donor_lsn; + SpinLockRelease(&wps->mutex); +} + /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -716,7 +755,6 @@ walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); - } static WalProposerConnectPollStatusType @@ -1090,7 +1128,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - __attribute__((unused)) TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1294,116 +1332,13 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* Download WAL before basebackup for logical walsenders from sk, if needed */ +/* + Used to download WAL before basebackup for logical walsenders from sk, no longer + needed because walsender always uses neon_walreader. 
+ */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - TimeLineID timeline; - XLogRecPtr startpos; - XLogRecPtr endpos; - - startpos = GetLogRepRestartLSN(wp); - if (startpos == InvalidXLogRecPtr) - return true; /* recovery not needed */ - endpos = wp->propEpochStartLsn; - - timeline = wp->greetRequest.timeline; - - if (!neon_auth_token) - { - memcpy(conninfo, sk->conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); - if (written > MAXCONNINFO || written < 0) - wpg_log(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - sk->host, sk->port, - err))); - return false; - } - wpg_log(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; 
/* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - return true; } @@ -1544,7 +1479,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1959,8 +1894,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - bool needToAdvanceSlot = false; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; @@ -1972,7 +1907,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) /* Only one main shard sends non-zero currentClusterSize */ if (sk->appendResponse.ps_feedback.currentClusterSize > 0) - 
SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); if (min_feedback.disk_consistent_lsn != standby_apply_lsn) { @@ -2094,10 +2029,25 @@ GetLogRepRestartLSN(WalProposer *wp) return lrRestartLsn; } +void +SetNeonCurrentClusterSize(uint64 size) +{ + pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); +} + +uint64 +GetNeonCurrentClusterSize(void) +{ + return pg_atomic_read_u64(&walprop_shared->currentClusterSize); +} +uint64 GetNeonCurrentClusterSize(void); + + static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c new file mode 100644 index 0000000000..93dce9de84 --- /dev/null +++ b/pgxn/neon/walsender_hooks.c @@ -0,0 +1,172 @@ +/*------------------------------------------------------------------------- + * + * walsender_hooks.c + * + * Implements XLogReaderRoutine in terms of NeonWALReader. Allows for + * fetching WAL from safekeepers, which normal xlogreader can't do. 
+ * + *------------------------------------------------------------------------- + */ +#include "walsender_hooks.h" +#include "postgres.h" +#include "fmgr.h" +#include "access/xlogdefs.h" +#include "replication/walsender.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "miscadmin.h" +#include "utils/wait_event.h" +#include "utils/guc.h" +#include "postmaster/interrupt.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +static NeonWALReader *wal_reader = NULL; +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +extern bool GetDonorShmem(XLogRecPtr *donor_lsn); + +static XLogRecPtr +NeonWALReadWaitForWAL(XLogRecPtr loc) +{ + while (!NeonWALReaderUpdateDonor(wal_reader)) + { + pg_usleep(1000); + CHECK_FOR_INTERRUPTS(); + } + + return WalSndWaitForWal(loc); +} + +static int +NeonWALPageRead( + XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf) +{ + XLogRecPtr rem_lsn; + + /* Wait for flush pointer to advance past our request */ + XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); + int count; + + if (flushptr < targetPagePtr + reqLen) + return -1; + + /* Read at most XLOG_BLCKSZ bytes */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + else + count = flushptr - targetPagePtr; + + /* + * Sometimes walsender requests non-monotonic sequences of WAL. If that's + * the case, we have to reset streaming from remote at the correct + * position. For example, walsender may try to verify the segment header + * when trying to read in the middle of it. 
+ */ + rem_lsn = NeonWALReaderGetRemLsn(wal_reader); + if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) + { + NeonWALReaderResetRemote(wal_reader); + } + + for (;;) + { + NeonWALReadResult res = NeonWALRead( + wal_reader, + readBuf, + targetPagePtr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * Setting ws_tli is required by the XLogReaderRoutine, it is used + * for segment name generation in error reports. + * + * ReadPageInternal updates ws_segno after calling cb on its own + * and XLogReaderRoutine description doesn't require it, but + * WALRead sets, let's follow it. + */ + xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; + xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; + + /* + * ws_file doesn't exist in case of remote read, and isn't used by + * xlogreader except by WALRead on which we don't rely anyway. + */ + return count; + } + if (res == NEON_WALREAD_ERROR) + { + elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", + LSN_FORMAT_ARGS(targetPagePtr), + reqLen, + NeonWALReaderErrMsg(wal_reader)); + return -1; + } + + /* + * Res is WOULDBLOCK, so we wait on the socket, recreating event set + * if necessary + */ + { + + pgsocket sock = NeonWALReaderSocket(wal_reader); + uint32_t reader_events = NeonWALReaderEvents(wal_reader); + long timeout_ms = 1000; + + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + WaitLatchOrSocket( + MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, + sock, + timeout_ms, + WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +static void +NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) +{ + neon_wal_segment_open(wal_reader, nextSegNo, tli_p); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +static void 
+NeonWALReadSegmentClose(XLogReaderState *xlogreader) +{ + neon_wal_segment_close(wal_reader); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +void +NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) +{ + if (!wal_reader) + { + XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + + if (epochStartLsn == 0) + { + elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + } + wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + } + xlr->page_read = NeonWALPageRead; + xlr->segment_open = NeonWALReadSegmentOpen; + xlr->segment_close = NeonWALReadSegmentClose; +} diff --git a/pgxn/neon/walsender_hooks.h b/pgxn/neon/walsender_hooks.h new file mode 100644 index 0000000000..2e3ce180f9 --- /dev/null +++ b/pgxn/neon/walsender_hooks.h @@ -0,0 +1,7 @@ +#ifndef __WALSENDER_HOOKS_H__ +#define __WALSENDER_HOOKS_H__ + +struct XLogReaderRoutine; +void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); + +#endif diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 89% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 23340e352e..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ -31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, 
blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5219571f11..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 82ce5be9f6..677006923d 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -299,8 +299,11 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - 
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; + + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -309,6 +312,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -361,7 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); relation_close(rel, AccessShareLock); @@ -380,6 +386,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -403,18 +412,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + not_modified_since = PG_ARGISNULL(6) ? 
request_lsn : PG_GETARG_LSN(6); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/poetry.lock b/poetry.lock index 7b49daf42a..ef9f572b17 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,88 +1,88 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" -version = "3.9.2" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = 
"aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = 
"aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = 
"aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", 
hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = 
"aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = 
"aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = 
"aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = 
"aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + [[package]] name = "anyio" version = "4.3.0" @@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = 
"sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -798,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices 
and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -931,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1001,18 +1008,17 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "3.0.10" +version = "4.0.1" description = "A Flask extension adding a decorator for CORS support" 
optional = false python-versions = "*" files = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, + {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, + {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, ] [package.dependencies] Flask = ">=0.9" -Six = "*" [[package]] name = "frozenlist" @@ -1191,13 +1197,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -1243,13 +1249,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -1269,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1310,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1339,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false 
+python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1372,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." +optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = 
"lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = 
"lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1423,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = 
{version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", 
"pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = ["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing 
(>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = 
"sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1655,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = 
"sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1703,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1729,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1841,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = 
"py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] +[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1862,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = 
"sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + 
{file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", 
hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = 
"pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -2116,28 +2361,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2216,6 +2439,94 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression 
module, to replace re." +optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = 
"regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = 
"regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = 
"regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = 
"sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2256,18 +2567,18 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" @@ -2366,22 +2677,18 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" optional = false 
-python-versions = ">=3" +python-versions = ">=3.8" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2611,13 +2918,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] @@ -2889,4 +3196,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" +content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 93a1fe85db..3002006aed 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -10,7 +10,13 @@ testing = [] [dependencies] anyhow.workspace = true +async-compression.workspace = true async-trait.workspace = true 
+atomic-take.workspace = true +aws-config.workspace = true +aws-sdk-iam.workspace = true +aws-sigv4.workspace = true +aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -27,13 +33,19 @@ hashlink.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true +http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } +indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true @@ -48,8 +60,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } -reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -73,6 +85,7 @@ tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true @@ -92,6 +105,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true +fallible-iterator.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 
bc307230dd..6a906b299b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,18 +2,28 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; +use crate::intern::EndpointIdInt; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -25,10 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -174,6 +181,94 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because 
that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + +impl AuthenticationConfig { + pub fn check_rate_limit( + &self, + ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, + secret: AuthSecret, + endpoint: &EndpointId, + is_cleartext: bool, + ) -> auth::Result { + // we have validated the endpoint exists, so let's intern it. + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); + + // only count the full hash count if password hack or websocket flow. + // in other words, if proxy needs to run the hashing + let password_weight = if is_cleartext { + match &secret { + #[cfg(any(test, feature = "testing"))] + AuthSecret::Md5(_) => 1, + AuthSecret::Scram(s) => s.iterations + 1, + } + } else { + // validating scram takes just 1 hmac_sha_256 operation. 
+ 1 + }; + + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); + + if !limit_not_exceeded { + warn!( + enabled = self.rate_limiter_enabled, + "rate limiting authentication" + ); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); + + if self.rate_limiter_enabled { + return Err(auth::AuthError::too_many_connections()); + } + } + + Ok(secret) + } +} + /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -185,6 +280,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. @@ -210,18 +306,33 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } + + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, }; + let (cached_entry, secret) = cached_secret.take_value(); + + let secret = match secret { + Some(secret) => config.check_rate_limit( + ctx, + config, + secret, + &info.endpoint, + unauthenticated_password.is_some() || allow_cleartext, + )?, + None => { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. 
+ info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) + } + }; - let secret = cached_secret.value.clone().unwrap_or_else(|| { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. - info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random())) - }); match authenticate_with_secret( ctx, secret, @@ -237,7 +348,7 @@ async fn auth_quirks( Err(e) => { if e.is_auth_failed() { // The password could have been changed, so we invalidate the cache. - cached_secret.invalidate(); + cached_entry.invalidate(); } Err(e) } @@ -311,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result> { use BackendType::*; @@ -322,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { "performing authentication using the console" ); - let credentials = - auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
@@ -408,3 +528,309 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { } } } + +#[cfg(test)] +mod tests { + use std::{net::IpAddr, sync::Arc, time::Duration}; + + use bytes::BytesMut; + use fallible_iterator::FallibleIterator; + use once_cell::sync::Lazy; + use postgres_protocol::{ + authentication::sasl::{ChannelBinding, ScramSha256}, + message::{backend::Message as PgMessage, frontend}, + }; + use provider::AuthSecret; + use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; + + use crate::{ + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, + config::AuthenticationConfig, + console::{ + self, + provider::{self, CachedAllowedIps, CachedRoleSecret}, + CachedNodeInfo, + }, + context::RequestMonitoring, + proxy::NeonOptions, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, + scram::ServerSecret, + stream::{PqStream, Stream}, + }; + + use super::{auth_quirks, AuthRateLimiter}; + + struct Auth { + ips: Vec, + secret: AuthSecret, + } + + impl console::Api for Auth { + async fn get_role_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) + } + + async fn get_allowed_ips_and_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + Ok(( + CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), + Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + )) + } + + async fn wake_compute( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + unimplemented!() + } + } + + static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + scram_protocol_timeout: std::time::Duration::from_secs(5), + rate_limiter_enabled: true, + rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, + }); + + async fn 
read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { + loop { + r.read_buf(&mut *b).await.unwrap(); + if let Some(m) = PgMessage::parse(&mut *b).unwrap() { + break m; + } + } + } + + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + + #[tokio::test] + async fn auth_quirks_scram() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + 
options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); + + let mut read = BytesMut::new(); + + // server should offer scram + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSasl(a) => { + let options: Vec<&str> = a.mechanisms().collect().unwrap(); + assert_eq!(options, ["SCRAM-SHA-256"]); + } + _ => panic!("wrong message"), + } + + // client sends client-first-message + let mut write = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-first-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslContinue(a) => { + scram.update(a.data()).await.unwrap(); + } + _ => panic!("wrong message"), + } + + // client response with client-final-message + write.clear(); + frontend::sasl_response(scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-final-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslFinal(a) => { + scram.finish(a.data()).unwrap(); + } + _ => panic!("wrong message"), + } + }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_cleartext() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + 
user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + let mut write = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + write.clear(); + frontend::password_message(b"my-secret-password", &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_password_hack() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: None, + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + let mut write = BytesMut::new(); + frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) + .unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + 
&CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + assert_eq!(creds.info.endpoint, "my-endpoint"); + + handle.await.unwrap(); + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 7db76f3d9e..415a4b7d85 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,8 +102,7 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); - let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); + info!("woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with 
password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 385f7820cb..fb16b76567 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,14 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -175,7 +174,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { @@ -198,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -227,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -250,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, 
tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -267,13 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = Default::default(); + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); - ctx.log(); + ctx.log_connect(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b3d4fc0411..be7d961b8c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,8 +1,17 @@ +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -10,11 +19,15 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use 
proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; +use proxy::redis::cancellation_publisher::RedisPublisherClient; +use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::elasticache; use proxy::redis::notifications; -use proxy::redis::publisher::RedisPublisherClient; +use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -30,6 +43,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -105,8 +119,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -119,28 +136,29 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. 
+ #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -150,15 +168,51 @@ struct ProxyCliArgs { /// disable ip check for http requests. If it is too time consuming, it could be turned off. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, - /// redis url for notifications. 
+ /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}")] + metric_backup_collection_remote_storage: String, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
+ #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -193,6 +247,12 @@ struct SqlOverHttpArgs { /// increase memory used by the pool #[clap(long, default_value_t = 128)] sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, } #[tokio::main] @@ -203,19 +263,78 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); + info!("Using region: {}", config.aws_region); + + let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", 
"AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( + elasticache::AWSIRSAConfig::new( + config.aws_region.clone(), + args.redis_cluster_name, + args.redis_user_id, + ), + aws_credentials_provider, + )); + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -231,21 +350,28 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - let redis_publisher = match &args.redis_notifications { - Some(url) => 
Some(Arc::new(Mutex::new(RedisPublisherClient::new( - url, + + let redis_publisher = match ®ional_redis_client { + Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + redis_publisher.clone(), args.region.clone(), &config.redis_rps_limit, )?))), None => None, }; - let cancellation_handler = Arc::new(CancellationHandler::new( + let cancellation_handler = Arc::new(CancellationHandler::< + Option>>, + >::new( cancel_map.clone(), redis_publisher, + proxy::metrics::CancellationSource::FromClient, )); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -253,8 +379,8 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -268,8 +394,8 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); } @@ -280,27 +406,60 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. 
these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token.clone(), + )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); - maintenance_tasks.spawn(notifications::task_main( - url.to_owned(), - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = 
api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } @@ -343,6 +502,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: remote_storage_from_toml( + &args.metric_backup_collection_remote_storage, + )?, + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -351,6 +517,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( @@ -358,46 +525,59 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: 
config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); - let config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, permits, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), - )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -413,6 +593,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let 
config::ConcurrencyLockOptions { + shards, + permits, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -423,13 +620,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { opt_in: args.sql_over_http.sql_over_http_pool_opt_in, max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; let mut redis_rps_limit = args.redis_rps_limit.clone(); RateBucketInfo::validate(&mut redis_rps_limit)?; @@ -442,13 +642,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, - // TODO: add this argument region: args.region.clone(), + aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + 
&args.connect_to_compute_retry, + )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 2af6a70e90..bc1c37512b 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -43,6 +43,16 @@ impl Cached { Self { token: None, value } } + pub fn take_value(self) -> (Cached, V) { + ( + Cached { + token: self.token, + value: (), + }, + self.value, + ) + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..4bc10a6020 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,247 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; +use tracing::info; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct 
BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. + if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. 
+ if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. 
+ } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. 
+ continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 6e3eb8c1b0..10cc4ceee1 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,9 +5,11 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -16,16 +18,17 @@ use crate::{ config::ProjectInfoCacheOptions, console::AuthSecret, intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, ProjectId, RoleName, + EndpointId, RoleName, }; use super::{Cache, Cached}; +#[async_trait] pub trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, 
role_name: RoleNameInt); - fn enable_ttl(&self); - fn disable_ttl(&self); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } struct Entry { @@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl { start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); @@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } @@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl { config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } @@ -214,14 +232,11 @@ impl ProjectInfoCacheImpl { } pub fn insert_role_secret( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, - role_name: &RoleName, + project_id: 
ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, secret: Option, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); - let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -234,12 +249,10 @@ impl ProjectInfoCacheImpl { } pub fn insert_allowed_ips( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, allowed_ips: Arc>, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -358,7 +371,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::scram::ServerSecret; + use crate::{scram::ServerSecret, ProjectId}; #[tokio::test] async fn test_project_info_cache_settings() { @@ -369,22 +382,33 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret2 = None; let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + 
(&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); assert!(cached.cached()); @@ -395,11 +419,13 @@ mod tests { // Shouldn't add more than 2 roles. let user3: RoleName = "user3".into(); - let secret3 = Some(AuthSecret::Scram(ServerSecret::mock( - user3.as_str(), - [3; 32], - ))); - cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -424,28 +450,36 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, 
&endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); tokio::time::advance(Duration::from_secs(2)).await; // Nothing should be invalidated. @@ -473,7 +507,7 @@ mod tests { } #[tokio::test] - async fn test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -482,26 +516,30 @@ mod tests { gc_interval: Duration::from_secs(600), })); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.clone().disable_ttl(); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; - cache.insert_role_secret(&project_id, &endpoint_id, &user2, 
secret2.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); // Added before ttl was disabled + ttl should be still cached. let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); @@ -515,7 +553,11 @@ mod tests { assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); // Added after ttl was disabled + ttl should not be cached. - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); assert!(!cached.cached()); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c9607909b3..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,4 +1,3 @@ -use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; @@ -10,18 +9,26 @@ use tracing::info; use uuid::Uuid; use crate::{ - error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS, - redis::publisher::RedisPublisherClient, + error::ReportableError, + metrics::{CancellationRequest, CancellationSource, Metrics}, + redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, + }, }; pub type CancelMap = Arc>>; +pub type CancellationHandlerMain = CancellationHandler>>>; +pub type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. /// -/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler { +/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler

{ map: CancelMap, - redis_client: Option>>, + client: P, + /// This field used for the monitoring purposes. + /// Represents the source of the cancellation request. + from: CancellationSource, } #[derive(Debug, Error)] @@ -44,49 +51,9 @@ impl ReportableError for CancelError { } } -impl CancellationHandler { - pub fn new(map: CancelMap, redis_client: Option>>) -> Self { - Self { map, redis_client } - } - /// Cancel a running query for the corresponding connection. - pub async fn cancel_session( - &self, - key: CancelKeyData, - session_id: Uuid, - ) -> Result<(), CancelError> { - let from = "from_client"; - // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { - tracing::warn!("query cancellation key not found: {key}"); - if let Some(redis_client) = &self.redis_client { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - info!("publishing cancellation key to Redis"); - match redis_client.lock().await.try_publish(key, session_id).await { - Ok(()) => { - info!("cancellation key successfuly published to Redis"); - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - } - return Ok(()); - }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await - } - +impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub fn get_session(self: Arc) -> Session { + pub fn get_session(self: Arc) -> Session

{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the @@ -112,9 +79,47 @@ impl CancellationHandler { cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + // NB: we should immediately release the lock after cloning the token. + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); + match self.client.try_publish(key, session_id).await { + Ok(()) => {} // do nothing + Err(e) => { + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query().await + } #[cfg(test)] - fn contains(&self, session: &Session) -> bool { + fn contains(&self, session: &Session

) -> bool { self.map.contains_key(&session.key) } @@ -124,31 +129,19 @@ impl CancellationHandler { } } -#[async_trait] -pub trait NotificationsCancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +impl CancellationHandler<()> { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { + Self { + map, + client: (), + from, + } + } } -#[async_trait] -impl NotificationsCancellationHandler for CancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { - let from = "from_redis"; - let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); - match cancel_closure { - Some(cancel_closure) => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - cancel_closure.try_cancel_query().await - } - None => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - tracing::warn!("query cancellation key not found: {key}"); - Ok(()) - } - } +impl CancellationHandler>>> { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { + Self { map, client, from } } } @@ -178,14 +171,14 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session { +pub struct Session

{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc, + cancellation_handler: Arc>, } -impl Session { +impl

Session

{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { @@ -198,7 +191,7 @@ impl Session { } } -impl Drop for Session { +impl

Drop for Session

{ fn drop(&mut self) { self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); @@ -211,10 +204,10 @@ mod tests { #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler { - map: CancelMap::default(), - redis_client: None, - }); + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + CancelMap::default(), + CancellationSource::FromRedis, + )); let session = cancellation_handler.clone().get_session(); assert!(cancellation_handler.contains(&session)); @@ -224,4 +217,19 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn cancel_session_noop_regression() { + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); + handler + .cancel_session( + CancelKeyData { + backend_pid: 0, + cancel_key: 0, + }, + Uuid::new_v4(), + ) + .await + .unwrap(); + } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b61c1fb9ef..4433b3c1c2 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,15 +1,15 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::{errors::WakeComputeError, messages::MetricsAuxInfo}, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -34,6 +34,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { @@ -57,6 +60,9 @@ 
impl UserFacingError for ConnectionError { None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } @@ -72,6 +78,7 @@ impl ReportableError for ConnectionError { ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -82,14 +89,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone)] -#[repr(transparent)] +#[derive(Clone, Default)] pub struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { pub fn new() -> Self { - Self(Default::default()) + Self::default() } /// Reuse password or auth keys from the other config. @@ -103,6 +109,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. @@ -165,12 +181,6 @@ impl std::ops::DerefMut for ConnCfg { } } -impl Default for ConnCfg { - fn default() -> Self { - Self::new() - } -} - impl ConnCfg { /// Establish a raw TCP connection to the compute node. 
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { @@ -256,7 +266,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -268,7 +278,9 @@ impl ConnCfg { aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -278,11 +290,14 @@ impl ConnCfg { let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info.as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -301,9 +316,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 437ec9f401..b7ab2c00f9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,13 @@ -use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; +use crate::{ + auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, + rate_limiter::RateBucketInfo, + serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, + Host, +}; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; +use 
remote_storage::RemoteStorageConfig; use rustls::{ crypto::ring::sign, pki_types::{CertificateDer, PrivateKeyDer}, @@ -24,16 +31,20 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, + pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, + pub connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { @@ -45,10 +56,15 @@ pub struct TlsConfig { pub struct HttpConfig { pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, + pub cancel_set: CancelSet, + pub client_conn_threshold: u64, } pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, + pub rate_limiter_enabled: bool, + pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -304,6 +320,95 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. 
+ /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse 
endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: OptRemoteStorageConfig, + pub chunk_size: usize, +} + +/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +pub type OptRemoteStorageConfig = Option; + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. #[derive(Debug)] pub struct CacheOptions { @@ -419,8 +524,61 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. +#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. + pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. 
+ pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. -pub struct WakeComputeLockOptions { +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn @@ -431,9 +589,12 @@ pub struct WakeComputeLockOptions { pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. 
+ pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=10,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; @@ -483,7 +644,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -519,7 +680,7 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -530,7 +691,7 @@ mod tests { assert_eq!(shards, 32); assert_eq!(permits, 4); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -541,7 +702,7 @@ mod tests { assert_eq!(shards, 16); assert_eq!(permits, 8); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, diff --git a/proxy/src/console.rs b/proxy/src/console.rs index fd3c46b946..ea95e83437 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; +pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 102076f2c6..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,9 +1,10 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; -use crate::{BranchId, EndpointId, ProjectId}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
@@ -18,7 +19,7 @@ pub struct ConsoleError { pub struct GetRoleSecret { pub role_secret: Box, pub allowed_ips: Option>, - pub project_id: Option, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -93,22 +94,49 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. -#[derive(Debug, Deserialize, Clone, Default)] +#[derive(Debug, Deserialize, Clone)] pub struct MetricsAuxInfo { - pub endpoint_id: EndpointId, - pub project_id: ProjectId, - pub branch_id: BranchId, - pub cold_start_info: Option, + pub endpoint_id: EndpointIdInt, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + #[serde(default)] + pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] - Unknown = 0, - Warm = 1, - PoolHit = 2, - PoolMiss = 3, + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, + + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub fn as_str(&self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } + } } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8609606273..3b996cdbd1 100644 --- 
a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,17 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, EndpointCacheKey, ProjectId, + error::ReportableError, + intern::ProjectIdInt, + metrics::ApiLockMetrics, + scram, EndpointCacheKey, }; -use async_trait::async_trait; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -29,6 +31,8 @@ pub mod errors { }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; @@ -75,7 +79,7 @@ pub mod errors { } http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") } _ => REQUEST_FAILED.to_owned(), }, @@ -207,8 +211,11 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("Too many connections attempts")] + TooManyConnections, + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. 
@@ -218,17 +225,6 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -239,7 +235,11 @@ pub mod errors { // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), - TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), + TooManyConnections => self.to_string(), + + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -249,7 +249,8 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), - WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, + WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -272,7 +273,7 @@ pub struct AuthInfo { /// List of IP addresses allowed for the autorization. pub allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub project_id: Option, } /// Info for establishing a connection to a compute node. @@ -326,8 +327,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc), } -#[async_trait] impl Api for ConsoleBackend { async fn get_role_secret( &self, @@ -418,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. 
+ pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -433,83 +435,58 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } /// Various caches for [`console`](super). -pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } -impl ApiLocks { +#[derive(Debug, thiserror::Error)] +pub enum ApiLockError { + #[error("lock was closed")] + AcquireError(#[from] tokio::sync::AcquireError), + #[error("permit could not be acquired")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + +impl ApiLocks { pub fn new( name: &'static str, permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api 
lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + epoch, + metrics, }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } @@ -522,7 +499,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -530,20 +507,21 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut 
interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -556,13 +534,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 0579ef6fc4..cfe491f2aa 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,11 +4,16 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; -use async_trait::async_trait; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -115,7 +120,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; 
@@ -144,7 +154,6 @@ async fn get_execute_postgres_query( Ok(Some(entry)) } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index b36663518d..7728d2cafa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,14 +7,15 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, + scram, EndpointCacheKey, Normalize, }; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, -}; -use async_trait::async_trait; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -24,7 +25,8 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -33,7 +35,8 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -43,6 +46,7 @@ impl Api { endpoint, caches, locks, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -56,7 +60,16 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); 
+ return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -82,7 +95,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -96,7 +111,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -113,7 +131,7 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -168,7 +186,6 @@ impl Api { } } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( @@ -176,22 +193,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); 
self.caches.project_info.insert_allowed_ips( - &project_id, - ep, + project_id, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -205,29 +227,34 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( @@ -250,29 +277,40 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. 
if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } - let permit = self.locks.get_wake_compute_permit(&key).await?; + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize().into(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } } - let node = self.do_wake_compute(ctx, user_info).await?; + let mut node = self.do_wake_compute(ctx, user_info).await?; ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); - let (_, cached) = self.caches.node_info.insert(key.clone(), node); + let cold_start_info = node.aux.cold_start_info; + info!("woken up a compute node"); + + // store the cached node as 'warm' + node.aux.cold_start_info = ColdStartInfo::WarmCached; + let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); + cached.aux.cold_start_info = cold_start_info; + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 7ca830cdb4..dfd3ef108e 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,21 +5,23 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use 
tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, DbName, EndpointId, ProjectId, RoleName, + intern::{BranchIdInt, ProjectIdInt}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + DbName, EndpointId, RoleName, }; use self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -28,14 +30,14 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, // filled in as they are discovered - project: Option, - branch: Option, + project: Option, + branch: Option, endpoint_id: Option, dbname: Option, user: Option, @@ -43,12 +45,17 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - cold_start_info: Option, + pub(crate) cold_start_info: ColdStartInfo, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
+ rejected: Option, + disconnect_timestamp: Option>, } #[derive(Clone, Debug)] @@ -64,7 +71,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -73,6 +80,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -92,16 +100,19 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - cold_start_info: None, + rejected: None, + cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -112,27 +123,36 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = Some(rejected); + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { - self.cold_start_info = Some(info); + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); } pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.set_endpoint_id(x.endpoint_id); + if self.endpoint_id.is_none() { + self.set_endpoint_id(x.endpoint_id.as_str().into()) + } self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.cold_start_info = x.cold_start_info; + self.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&mut self, project_id: ProjectId) { + pub fn set_project_id(&mut self, project_id: ProjectIdInt) { self.project = Some(project_id); } pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { - self.span.record("ep", display(&endpoint_id)); - 
crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); - self.endpoint_id = Some(endpoint_id); + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } } pub fn set_application(&mut self, app: Option) { @@ -144,6 +164,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } @@ -151,14 +172,22 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } @@ -167,13 +196,55 @@ impl RequestMonitoring { self.success = true; } - pub fn log(self) {} -} - -impl Drop for RequestMonitoring { - fn drop(&mut self) { + pub fn log_connect(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. 
+ self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } + } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a2be1c4186..8104fe6087 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,12 +13,17 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; + use super::{RequestMonitoring, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] @@ -29,6 +34,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -50,21 +58,13 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) 
-} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -93,10 +93,12 @@ pub struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - cold_start_info: Option<&'static str>, + cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
+ disconnect_timestamp: Option, } impl From<&RequestMonitoring> for RequestData { @@ -117,20 +119,16 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - cold_start_info: value.cold_start_info.as_ref().map(|x| match x { - crate::console::messages::ColdStartInfo::Unknown => "unknown", - crate::console::messages::ColdStartInfo::Warm => "warm", - crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", - crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", - }), + cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -152,8 +150,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -178,9 +177,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. 
+ if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. + drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -424,6 +452,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) @@ -460,8 +489,9 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - cold_start_info: Some("no"), + cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -530,15 +560,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), 
+ (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -568,11 +598,11 @@ mod tests { assert_eq!( file_stats, [ - (1220668, 5, 10000), - (1226818, 5, 10000), - (1228612, 5, 10000), - (1227974, 5, 10000), - (1219252, 5, 10000) + (1221738, 5, 10000), + (1227888, 5, 10000), + (1229682, 5, 10000), + (1229044, 5, 10000), + (1220322, 5, 10000) ] ); @@ -604,11 +634,11 @@ mod tests { assert_eq!( file_stats, [ - (1206315, 5, 10000), - (1206046, 5, 10000), - (1206339, 5, 10000), - (1206327, 5, 10000), - (1206582, 5, 10000) + (1207385, 5, 10000), + (1207116, 5, 10000), + (1207409, 5, 10000), + (1207397, 5, 10000), + (1207652, 5, 10000) ] ); @@ -633,15 +663,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -678,7 +708,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] + [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. 
pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. 
-pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } @@ -90,22 +92,23 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -118,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| 
{ let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! 
{ info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, 
Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: 
MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? 
as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..35c1616481 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,25 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. 
+impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 02ebcd6aaa..1590316925 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,170 +1,372 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +use crate::console::messages::ColdStartInfo; -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - 
&["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 2 * 2 * 2 * 2 = 48 counters - &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. 
+ // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. 
+ #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_conn_content_length_bytes", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(8.0, 2.0, 20).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). 
+ pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, + + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). 
+ pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} 
+ +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, 
NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum 
CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, Client, Compute, + RetryTimeout, } #[derive(Default)] @@ -172,6 +374,7 @@ struct Accumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, + retry: time::Duration, } pub struct LatencyTimer { @@ -182,10 +385,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -195,17 +397,15 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: ConnectOutcome::Failed, } } @@ -217,12 +417,8 @@ impl LatencyTimer { } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { @@ -230,7 +426,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = ConnectOutcome::Success; } } @@ -241,117 +437,119 @@ impl Drop for LatencyTimerPause<'_> { Waiting::Cplane => self.timer.accumulated.cplane += dur, 
Waiting::Client => self.timer.accumulated.client += dur, Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, } } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue communication from the accumulated time. 
+ let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. + let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] 
+pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,48 +1,26 @@ //! 
Proxy Protocol V2 implementation use std::{ - future::{poll_fn, Future}, io, net::SocketAddr, - pin::{pin, Pin}, - sync::Mutex, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use bytes::BytesMut; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; - -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, - pub protocol: &'static str, -} pin_project! { - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -77,364 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 
0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. 
+ let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. 
Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. 
+ 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. 
+ let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. 
+ // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. 
- // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) - } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); - type Error = io::Error; + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); } - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! 
{ - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) + Poll::Ready(Ok(())) } } #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -456,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -488,17 
+275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -506,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -549,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ab5bf5d494..5824b70df9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,16 +7,17 @@ pub mod handshake; pub 
mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, - cancellation::{self, CancellationHandler}, + cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, - protocol2::WithClientIp, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, @@ -24,7 +25,6 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -61,8 +61,8 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, - cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -79,31 +79,30 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - match socket.wait_for_addr().await { - Ok(Some(addr)) => peer_addr = addr.ip(), + let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok(None) if config.require_client_ip => { + Ok((_socket, None)) if config.require_client_ip => { error!("missing required client IP"); return; } - Ok(None) => {} - } + Ok((socket, None)) => (socket, peer_addr.ip()) + }; match socket.inner.set_nodelay(true) { Ok(()) => {}, @@ -113,7 +112,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -122,7 +126,7 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, + endpoint_rate_limiter2, conn_gauge, ) .instrument(span.clone()) @@ -132,16 +136,14 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); error!(parent: &span, "per-client task finished with an 
error: {e:#}"); } Ok(None) => { ctx.set_success(); - ctx.log(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(e) => { @@ -233,23 +235,26 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, -) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? 
{ HandshakeData::Startup(stream, params) => (stream, params), @@ -278,15 +283,6 @@ pub async fn handle_client( Err(e) => stream.throw_error(e).await?, }; - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await?; - } - } - let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( @@ -294,6 +290,7 @@ pub async fn handle_client( &mut stream, mode.allow_cleartext(), &config.authentication_config, + endpoint_rate_limiter, ) .await { @@ -309,9 +306,14 @@ pub async fn handle_client( let mut node = connect_to_compute( ctx, - &TcpMechanism { params: ¶ms }, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, &user_info, mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, ) .or_else(|e| stream.throw_error(e)) .await?; @@ -338,9 +340,9 @@ pub async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection( +async fn prepare_client_connection

( node: &compute::PostgresConnection, - session: &cancellation::Session, + session: &cancellation::Session

, stream: &mut PqStream, ) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c76e2ff6d9..c8528d0296 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,14 +1,16 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, + config::RetryConfig, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, }, + Host, }; use async_trait::async_trait; use pq_proto::StartupMessageParams; @@ -27,10 +29,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } @@ -63,6 +65,9 @@ pub trait ComputeConnectBackend { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. 
pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -78,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { + let host = node_info.config.get_host()?; + let _permit = self.locks.get_permit(&host).await?; node_info.connect(ctx, timeout).await } @@ -87,26 +94,29 @@ impl ConnectMechanism for TcpMechanism<'_> { } /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; if let Some(keys) = user_info.get_keys() { node_info.set_keys(keys); } node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -115,6 +125,13 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } Err(e) => e, @@ -122,19 +139,26 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() { + let node_info = if !node_info.cached() || !err.should_retry_database_address() { // If we just recieved this from 
cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries) { + if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(err.into()); } node_info } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); mechanism.update_connect_config(&mut node_info.config); @@ -153,21 +177,40 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); 
time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b2f682fd2f..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, - usage_metrics::{Ids, USAGE_METRICS}, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -19,28 +18,29 @@ pub async 
fn proxy_pass( aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -55,17 +55,17 @@ pub async fn proxy_pass( Ok(()) } -pub struct ProxyPassthrough { +pub struct ProxyPassthrough { pub client: Stream, pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, - pub cancel: cancellation::Session, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, + pub cancel: cancellation::Session

, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub async fn proxy_pass(self) -> anyhow::Result<()> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; self.compute.cancel_closure.try_cancel_query().await?; diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..8dec1f1137 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,21 +1,18 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -39,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
+ !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -51,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -61,8 +82,19 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, + _ => true, + } + } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 9c3be73612..ad48af0093 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -10,13 +10,14 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; -use crate::config::CertResolver; +use crate::config::{CertResolver, RetryConfig}; use 
crate::console::caches::NodeInfoCache; +use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{http, sasl, scram}; +use crate::proxy::retry::retry_after; +use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; @@ -142,8 +143,8 @@ impl Scram { Ok(Scram(secret)) } - fn mock(user: &str) -> Self { - Scram(scram::ServerSecret::mock(user, rand::random())) + fn mock() -> Self { + Scram(scram::ServerSecret::mock(rand::random())) } } @@ -173,8 +174,8 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let (client, _) = read_proxy_protocol(client).await?; + let mut stream = match handshake(client, tls.as_ref(), false).await? 
{ HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; @@ -330,11 +331,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; - let proxy = tokio::spawn(dummy_proxy( - client, - Some(server_config), - Scram::mock("user"), - )); + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::{distributions::Alphanumeric, Rng}; let password: String = rand::thread_rng() @@ -364,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -516,7 +517,12 @@ impl TestBackend for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; let (_, node) = cache.insert("key".into(), node); @@ -547,7 +553,12 @@ async fn connect_to_compute_success() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = 
helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -560,7 +571,12 @@ async fn connect_to_compute_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -574,7 +590,12 @@ async fn connect_to_compute_non_retry_1() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -588,7 +609,12 @@ async fn connect_to_compute_non_retry_2() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -598,17 +624,32 @@ async fn connect_to_compute_non_retry_2() { #[tokio::test] 
async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); - assert_eq!(NUM_RETRIES_CONNECT, 16); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } @@ -620,7 +661,12 @@ async fn wake_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -634,7 +680,12 @@ async fn wake_non_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + 
backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 +34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..94b03e1ccc 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,10 +1,14 @@ +use crate::config::RetryConfig; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; use crate::proxy::retry::retry_after; -use hyper::StatusCode; +use hyper1::StatusCode; use std::ops::ControlFlow; -use tracing::{error, warn}; +use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; @@ -13,25 +17,48 @@ pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, api: &B, + config: RetryConfig, ) -> Result { + let retry_type = RetryType::WakeCompute; loop { let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries) { + match handle_try_wake(wake_res, 
*num_retries, config) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); return Err(e); } Ok(ControlFlow::Continue(e)) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => return Ok(n), + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } } - let wait_duration = retry_after(*num_retries); + let wait_duration = retry_after(*num_retries, config); *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; + drop(pause); } } @@ -42,10 +69,11 @@ pub async fn wake_compute( pub fn handle_try_wake( result: Result, num_retries: u32, + config: RetryConfig, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { Ok(ControlFlow::Continue(err)) } _ => Err(err), @@ -57,39 +85,47 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: 
StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. 
}) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index f0da4ead23..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. 
-pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - 
limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. 
- pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3181060e2f..b8c9490696 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,9 +1,10 @@ use std::{ + borrow::Cow, collections::hash_map::RandomState, - hash::BuildHasher, + hash::{BuildHasher, Hash}, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -11,24 +12,18 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::EndpointId; +use crate::intern::EndpointIdInt; -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; - -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -48,12 +43,12 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now)); + .zip(&self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted - self.data.iter_mut().for_each(RateBucket::inc); + self.data.iter_mut().for_each(|b| b.inc(1)); } should_allow_request @@ -66,14 +61,11 @@ impl RedisRateLimiter { // Purposefully ignore 
user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. -pub struct EndpointRateLimiter { - map: DashMap, Hasher>, - info: &'static [RateBucketInfo], +pub type EndpointRateLimiter = BucketRateLimiter; + +pub struct BucketRateLimiter { + map: DashMap, Hasher>, + info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, } @@ -85,9 +77,9 @@ struct RateBucket { } impl RateBucket { - fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool { + fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool { if now - self.start < info.interval { - self.count < info.max_rpi + self.count + n <= info.max_rpi } else { // bucket expired, reset self.count = 0; @@ -97,8 +89,8 @@ impl RateBucket { } } - fn inc(&mut self) { - self.count += 1; + fn inc(&mut self, n: u32) { + self.count += n; } } @@ -111,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32; + let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -142,6 +134,12 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { 
info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -150,7 +148,7 @@ impl RateBucketInfo { .find(|(a, b)| a.max_rpi > b.max_rpi); if let Some((a, b)) = invalid { bail!( - "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", + "invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", b.max_rpi, a.max_rpi, ); @@ -162,19 +160,24 @@ impl RateBucketInfo { pub const fn new(max_rps: u32, interval: Duration) -> Self { Self { interval, - max_rpi: max_rps * interval.as_millis() as u32 / 1000, + max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32, } } } -impl EndpointRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl BucketRateLimiter { + pub fn new(info: impl Into>) -> Self { Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) } } -impl EndpointRateLimiter { - fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { +impl BucketRateLimiter { + fn new_with_rand_and_hasher( + info: impl Into>, + rand: R, + hasher: S, + ) -> Self { + let info = info.into(); info!(buckets = ?info, "endpoint rate limiter"); Self { info, @@ -185,7 +188,7 @@ impl EndpointRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, endpoint: EndpointId) -> bool { + pub fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. 
// worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -195,7 +198,7 @@ impl EndpointRateLimiter { } let now = Instant::now(); - let mut entry = self.map.entry(endpoint).or_insert_with(|| { + let mut entry = self.map.entry(key).or_insert_with(|| { vec![ RateBucket { start: now, @@ -207,12 +210,12 @@ impl EndpointRateLimiter { let should_allow_request = entry .iter_mut() - .zip(self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now)); + .zip(&*self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now, n)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted - entry.iter_mut().for_each(RateBucket::inc); + entry.iter_mut().for_each(|b| b.inc(n)); } should_allow_request @@ -223,7 +226,7 @@ impl EndpointRateLimiter { /// But that way deletion does not aquire mutex on each entry access. pub fn do_gc(&self) { info!( - "cleaning up endpoint rate limiter, current size = {}", + "cleaning up bucket rate limiter, current size = {}", self.map.len() ); let n = self.map.shards().len(); @@ -234,419 +237,16 @@ impl EndpointRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). -pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. 
-#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. 
- Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. - } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. 
- pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let start = Instant::now(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - 
); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. - limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -677,7 +277,7 @@ mod tests { } #[test] - #[should_panic = "invalid endpoint RPS limits. 
10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] + #[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] fn rate_buckets_validate() { let mut rates: Vec = ["300@1s", "10@10s"] .into_iter() @@ -693,42 +293,43 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(Vec::leak(rates)); + let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { - for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + for _ in 0..50 { + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint, 1)); } } @@ -738,13 +339,10 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = 
BuildHasherDefault::::default(); - let limiter = EndpointRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { - limiter.check(format!("{i}").into()); + limiter.check(i, 1); } assert!(limiter.map.len() < 150_000); } diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index 35d6db074e..a322f0368c 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1,2 +1,4 @@ +pub mod cancellation_publisher; +pub mod connection_with_credentials_provider; +pub mod elasticache; pub mod notifications; -pub mod publisher; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs new file mode 100644 index 0000000000..7baf104374 --- /dev/null +++ b/proxy/src/redis/cancellation_publisher.rs @@ -0,0 +1,161 @@ +use std::sync::Arc; + +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use tokio::sync::Mutex; +use uuid::Uuid; + +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +use super::{ + connection_with_credentials_provider::ConnectionWithCredentialsProvider, + notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, +}; + +pub trait CancellationPublisherMut: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +pub trait CancellationPublisher: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +impl CancellationPublisher for () { + async fn try_publish( + &self, + _cancel_key_data: CancelKeyData, + _session_id: Uuid, + ) -> anyhow::Result<()> { + Ok(()) + } +} + +impl CancellationPublisherMut for P { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> 
anyhow::Result<()> { +

::try_publish(self, cancel_key_data, session_id).await + } +} + +impl CancellationPublisher for Option

{ + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if let Some(p) = self { + p.try_publish(cancel_key_data, session_id).await + } else { + Ok(()) + } + } +} + +impl CancellationPublisher for Arc> { + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + self.lock() + .await + .try_publish(cancel_key_data, session_id) + .await + } +} + +pub struct RedisPublisherClient { + client: ConnectionWithCredentialsProvider, + region_id: String, + limiter: GlobalRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + client: ConnectionWithCredentialsProvider, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + Ok(Self { + client, + region_id, + limiter: GlobalRateLimiter::new(info.into()), + }) + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + async fn try_publish_internal( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + } + } + tracing::info!("Publisher is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } +} + +impl CancellationPublisherMut for RedisPublisherClient { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + tracing::info!("publishing cancellation key to Redis"); + match self.try_publish_internal(cancel_key_data, session_id).await { + Ok(()) => { + tracing::info!("cancellation key successfuly published to Redis"); + Ok(()) + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + Err(e) + } + } + } +} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs new file mode 100644 index 0000000000..3a90d911c2 --- /dev/null +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -0,0 +1,237 @@ +use std::{sync::Arc, time::Duration}; + +use futures::FutureExt; +use redis::{ + aio::{ConnectionLike, MultiplexedConnection}, + ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, +}; +use tokio::task::JoinHandle; +use tracing::{error, info}; + +use super::elasticache::CredentialsProvider; + +enum Credentials { + Static(ConnectionInfo), + Dynamic(Arc, redis::ConnectionAddr), +} + +impl Clone for Credentials { + fn clone(&self) -> Self { + match self { + Credentials::Static(info) => Credentials::Static(info.clone()), + Credentials::Dynamic(provider, addr) => { + Credentials::Dynamic(Arc::clone(provider), addr.clone()) + } + } + } +} + +/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. +/// Provides PubSub connection without credentials refresh. 
+pub struct ConnectionWithCredentialsProvider { + credentials: Credentials, + con: Option, + refresh_token_task: Option>, + mutex: tokio::sync::Mutex<()>, +} + +impl Clone for ConnectionWithCredentialsProvider { + fn clone(&self) -> Self { + Self { + credentials: self.credentials.clone(), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } +} + +impl ConnectionWithCredentialsProvider { + pub fn new_with_credentials_provider( + host: String, + port: u16, + credentials_provider: Arc, + ) -> Self { + Self { + credentials: Credentials::Dynamic( + credentials_provider, + redis::ConnectionAddr::TcpTls { + host, + port, + insecure: false, + tls_params: None, + }, + ), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub fn new_with_static_credentials(params: T) -> Self { + Self { + credentials: Credentials::Static(params.into_connection_info().unwrap()), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + + pub async fn connect(&mut self) -> anyhow::Result<()> { + let _guard = self.mutex.lock().await; + if let Some(con) = self.con.as_mut() { + match Self::ping(con).await { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!("Error during PING: {e:?}"); + } + } + } else { + info!("Connection is not established"); + } + info!("Establishing a new connection..."); + self.con = None; + if let Some(f) = self.refresh_token_task.take() { + f.abort() + } + let mut con = self + .get_client() + .await? 
+ .get_multiplexed_tokio_connection() + .await?; + if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { + let credentials_provider = credentials_provider.clone(); + let con2 = con.clone(); + let f = tokio::spawn(async move { + let _ = Self::keep_connection(con2, credentials_provider).await; + }); + self.refresh_token_task = Some(f); + } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } + self.con = Some(con); + Ok(()) + } + + async fn get_connection_info(&self) -> anyhow::Result { + match &self.credentials { + Credentials::Static(info) => Ok(info.clone()), + Credentials::Dynamic(provider, addr) => { + let (username, password) = provider.provide_credentials().await?; + Ok(ConnectionInfo { + addr: addr.clone(), + redis: RedisConnectionInfo { + db: 0, + username: Some(username), + password: Some(password.clone()), + }, + }) + } + } + } + + async fn get_client(&self) -> anyhow::Result { + let client = redis::Client::open(self.get_connection_info().await?)?; + Ok(client) + } + + // PubSub does not support credentials refresh. + // Requires manual reconnection every 12h. + pub async fn get_async_pubsub(&self) -> anyhow::Result { + Ok(self.get_client().await?.get_async_pubsub().await?) + } + + // The connection lives for 12h. + // It can be prolonged with sending `AUTH` commands with the refreshed token. + // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits + async fn keep_connection( + mut con: MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + loop { + // The connection lives for 12h, for the sanity check we refresh it every hour. 
+ tokio::time::sleep(Duration::from_secs(60 * 60)).await; + match Self::refresh_token(&mut con, credentials_provider.clone()).await { + Ok(()) => { + info!("Token refreshed"); + } + Err(e) => { + error!("Error during token refresh: {e:?}"); + } + } + } + } + async fn refresh_token( + con: &mut MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + let (user, password) = credentials_provider.provide_credentials().await?; + redis::cmd("AUTH") + .arg(user) + .arg(password) + .query_async(con) + .await?; + Ok(()) + } + /// Sends an already encoded (packed) command into the TCP socket and + /// reads the single response from it. + pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult { + // Clone connection to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_command(cmd).await + } + + /// Sends multiple already encoded (packed) command into the TCP socket + /// and reads `count` responses from it. This is used to implement + /// pipelining. 
+ pub async fn send_packed_commands( + &mut self, + cmd: &redis::Pipeline, + offset: usize, + count: usize, + ) -> RedisResult> { + // Clone shared connection future to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_commands(cmd, offset, count).await + } +} + +impl ConnectionLike for ConnectionWithCredentialsProvider { + fn req_packed_command<'a>( + &'a mut self, + cmd: &'a redis::Cmd, + ) -> redis::RedisFuture<'a, redis::Value> { + (async move { self.send_packed_command(cmd).await }).boxed() + } + + fn req_packed_commands<'a>( + &'a mut self, + cmd: &'a redis::Pipeline, + offset: usize, + count: usize, + ) -> redis::RedisFuture<'a, Vec> { + (async move { self.send_packed_commands(cmd, offset, count).await }).boxed() + } + + fn get_db(&self) -> i64 { + 0 + } +} diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs new file mode 100644 index 0000000000..eded8250af --- /dev/null +++ b/proxy/src/redis/elasticache.rs @@ -0,0 +1,110 @@ +use std::time::{Duration, SystemTime}; + +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_sdk_iam::config::ProvideCredentials; +use aws_sigv4::http_request::{ + self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, +}; +use tracing::info; + +#[derive(Debug)] +pub struct AWSIRSAConfig { + region: String, + service_name: String, + cluster_name: String, + user_id: String, + token_ttl: Duration, + action: String, +} + +impl AWSIRSAConfig { + pub fn new(region: String, cluster_name: Option, user_id: Option) -> Self { + AWSIRSAConfig { + region, + service_name: "elasticache".to_string(), + cluster_name: cluster_name.unwrap_or_default(), + user_id: user_id.unwrap_or_default(), + // "The IAM authentication token is valid for 15 minutes" + // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits + token_ttl: 
Duration::from_secs(15 * 60), + action: "connect".to_string(), + } + } +} + +/// Credentials provider for AWS elasticache authentication. +/// +/// Official documentation: +/// +/// +/// Useful resources: +/// +pub struct CredentialsProvider { + config: AWSIRSAConfig, + credentials_provider: CredentialsProviderChain, +} + +impl CredentialsProvider { + pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { + CredentialsProvider { + config, + credentials_provider, + } + } + pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + let aws_credentials = self + .credentials_provider + .provide_credentials() + .await? + .into(); + info!("AWS credentials successfully obtained"); + info!("Connecting to Redis with configuration: {:?}", self.config); + let mut settings = SigningSettings::default(); + settings.signature_location = SignatureLocation::QueryParams; + settings.expires_in = Some(self.config.token_ttl); + let signing_params = aws_sigv4::sign::v4::SigningParams::builder() + .identity(&aws_credentials) + .region(&self.config.region) + .name(&self.config.service_name) + .time(SystemTime::now()) + .settings(settings) + .build()? 
+ .into(); + let auth_params = [ + ("Action", &self.config.action), + ("User", &self.config.user_id), + ]; + let auth_params = url::form_urlencoded::Serializer::new(String::new()) + .extend_pairs(auth_params) + .finish(); + let auth_uri = http::Uri::builder() + .scheme("http") + .authority(self.config.cluster_name.as_bytes()) + .path_and_query(format!("/?{auth_params}")) + .build()?; + info!("{}", auth_uri); + + // Convert the HTTP request into a signable request + let signable_request = SignableRequest::new( + "GET", + auth_uri.to_string(), + std::iter::empty(), + SignableBody::Bytes(&[]), + )?; + + // Sign and then apply the signature to the request + let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts(); + let mut signable_request = http::Request::builder() + .method("GET") + .uri(auth_uri) + .body(())?; + si.apply_to_request_http1x(&mut signable_request); + Ok(( + self.config.user_id.clone(), + signable_request + .uri() + .to_string() + .replacen("http://", "", 1), + )) + } +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 6ae848c0d2..87d723d17e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,13 +4,15 @@ use futures::StreamExt; use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::{ cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, + cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::REDIS_BROKEN_MESSAGES, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -18,23 +20,13 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: 
std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct RedisConsumerClient { - client: redis::Client, -} - -impl RedisConsumerClient { - pub fn new(url: &str) -> anyhow::Result { - let client = redis::Client::open(url)?; - Ok(Self { client }) - } - async fn try_connect(&self) -> anyhow::Result { - let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); - conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; - Ok(conn) - } +async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result { + let mut conn = client.get_async_pubsub().await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; + Ok(conn) } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -80,32 +72,39 @@ where serde_json::from_str(&s).map_err(::custom) } -struct MessageHandler< - C: ProjectInfoCache + Send + Sync + 'static, - H: NotificationsCancellationHandler + Send + Sync + 'static, -> { +struct MessageHandler { cache: Arc, - cancellation_handler: Arc, + cancellation_handler: Arc>, region_id: String, } -impl< - C: ProjectInfoCache + Send + Sync + 'static, - H: NotificationsCancellationHandler + Send + Sync + 'static, - > MessageHandler -{ - pub fn new(cache: Arc, cancellation_handler: Arc, region_id: String) -> Self { +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + +impl MessageHandler { + pub fn new( + cache: Arc, + cancellation_handler: Arc>, + 
region_id: String, + ) -> Self { Self { cache, cancellation_handler, region_id, } } - pub fn disable_ttl(&self) { - self.cache.disable_ttl(); + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; } - pub fn enable_ttl(&self) { - self.cache.enable_ttl(); + pub async fn decrement_active_listeners(&self) { + self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { @@ -116,9 +115,9 @@ impl< let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -130,6 +129,10 @@ impl< "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -139,7 +142,7 @@ impl< // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. match self .cancellation_handler - .cancel_session_no_publish(cancel_session.cancel_key_data) + .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) .await { Ok(()) => {} @@ -150,6 +153,17 @@ impl< } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. @@ -179,40 +193,29 @@ fn invalidate_cache(cache: Arc, msg: Notification) { } } -/// Handle console's invalidation messages. -#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main( - url: String, - cache: Arc, - cancel_map: CancelMap, - region_id: String, -) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); - let handler = MessageHandler::new( - cache, - Arc::new(CancellationHandler::new(cancel_map, None)), - region_id, - ); - +async fn handle_messages( + handler: MessageHandler, + redis: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { - let redis = RedisConsumerClient::new(&url)?; - let conn = match redis.try_connect().await { + if cancellation_token.is_cancelled() { + return Ok(()); + } + let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } }; - let mut stream = conn.into_on_message(); + let mut stream = conn.on_message(); while let Some(msg) = stream.next().await { match handler.handle_message(msg).await { Ok(()) => {} @@ -221,8 +224,47 @@ where break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - 
handler.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. + let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. 
+ cancellation_token.cancel(); + }); } } diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs deleted file mode 100644 index f85593afdd..0000000000 --- a/proxy/src/redis/publisher.rs +++ /dev/null @@ -1,80 +0,0 @@ -use pq_proto::CancelKeyData; -use redis::AsyncCommands; -use uuid::Uuid; - -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; - -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; - -pub struct RedisPublisherClient { - client: redis::Client, - publisher: Option, - region_id: String, - limiter: RedisRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - url: &str, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { - let client = redis::Client::open(url)?; - Ok(Self { - client, - publisher: None, - region_id, - limiter: RedisRateLimiter::new(info), - }) - } - pub async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - ) -> anyhow::Result<()> { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - self.publisher = None; - } - } - tracing::info!("Publisher is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id).await - } - - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - ) -> anyhow::Result<()> { - let conn = self - .publisher - .as_mut() - .ok_or_else(|| anyhow::anyhow!("not connected"))?; - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - }))?; - conn.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } - pub async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.get_async_connection().await { - Ok(conn) => { - self.publisher = Some(conn); - } - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e.into()); - } - } - Ok(()) - } -} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 682cbe795f..89dd33e59f 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -3,9 +3,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; -use sha2::digest::FixedOutput; -use sha2::{Digest, Sha256}; -use subtle::{Choice, ConstantTimeEq}; +use sha2::Sha256; use tokio::task::yield_now; use super::messages::{ @@ -13,6 +11,7 @@ use super::messages::{ }; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::ScramKey; use crate::config; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -104,7 +103,7 @@ async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { } // copied from -async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) { +async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey { let salted_password = pbkdf2(password, salt, iterations).await; let make_key = |name| { @@ -116,7 +115,7 @@ async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32] <[u8; 32]>::from(key.into_bytes()) }; - (make_key(b"Client 
Key"), make_key(b"Server Key")) + make_key(b"Client Key").into() } pub async fn exchange( @@ -124,21 +123,12 @@ pub async fn exchange( password: &[u8], ) -> sasl::Result> { let salt = base64::decode(&secret.salt_base64)?; - let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await; - let stored_key: [u8; 32] = Sha256::default() - .chain_update(client_key) - .finalize_fixed() - .into(); + let client_key = derive_client_key(password, &salt, secret.iterations).await; - // constant time to not leak partial key match - let valid = stored_key.ct_eq(&secret.stored_key.as_bytes()) - | server_key.ct_eq(&secret.server_key.as_bytes()) - | Choice::from(secret.doomed as u8); - - if valid.into() { - Ok(sasl::Outcome::Success(super::ScramKey::from(client_key))) - } else { + if secret.is_password_invalid(&client_key).into() { Ok(sasl::Outcome::Failure("password doesn't match")) + } else { + Ok(sasl::Outcome::Success(client_key)) } } @@ -220,7 +210,7 @@ impl SaslSentInner { .derive_client_key(&client_final_message.proof); // Auth fails either if keys don't match or it's pre-determined to fail. - if client_key.sha256() != secret.stored_key || secret.doomed { + if secret.is_password_invalid(&client_key).into() { return Ok(sasl::Step::Failure("password doesn't match")); } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 973126e729..32a3dbd203 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -1,17 +1,31 @@ //! Tools for client/server/stored key management. +use subtle::ConstantTimeEq; + /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Clone, Default, PartialEq, Eq, Debug)] +#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } +impl PartialEq for ScramKey { + fn eq(&self, other: &Self) -> bool { + self.ct_eq(other).into() + } +} + +impl ConstantTimeEq for ScramKey { + fn ct_eq(&self, other: &Self) -> subtle::Choice { + self.bytes.ct_eq(&other.bytes) + } +} + impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index b59baec508..f9372540ca 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -206,6 +206,28 @@ mod tests { } } + #[test] + fn parse_client_first_message_with_invalid_gs2_authz() { + assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none()) + } + + #[test] + fn parse_client_first_message_with_extra_params() { + let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz"); + assert_eq!(msg.username, "user"); + assert_eq!(msg.nonce, "nonce"); + assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); + } + + #[test] + fn parse_client_first_message_with_extra_params_invalid() { + // must be of the form `=<...>` + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none()); + } + #[test] fn parse_client_final_message() { let input = [ diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index b46d8c3ab5..44c4f9e44a 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,5 +1,7 @@ //! Tools for SCRAM server secret management. 
+use subtle::{Choice, ConstantTimeEq}; + use super::base64_decode_array; use super::key::ScramKey; @@ -40,16 +42,21 @@ impl ServerSecret { Some(secret) } + pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + // constant time to not leak partial key match + client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) + } + /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(user: &str, nonce: [u8; 32]) -> Self { - // Refer to `auth-scram.c : scram_mock_salt`. - let mocked_salt = super::sha256([user.as_bytes(), &nonce]); - + pub fn mock(nonce: [u8; 32]) -> Self { Self { - iterations: 4096, - salt_base64: base64::encode(mocked_salt), + // this doesn't reveal much information as we're going to use + // iteration count 1 for our generated passwords going forward. + // PG16 users can set iteration count=1 already today. + iterations: 1, + salt_base64: base64::encode(nonce), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index be9f90acde..f634ab4e98 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -3,42 +3,50 @@ //! Handles both SQL over HTTP and SQL over Websockets. 
mod backend; +pub mod cancel_set; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::Metrics; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use crate::{cancellation::CancellationHandler, config::ProxyConfig}; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use 
utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -46,8 +54,8 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, - cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -76,6 +84,7 @@ pub async fn task_main( let backend = Arc::new(PoolingBackend { pool: Arc::clone(&conn_pool), config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_config = match config.tls_config.as_ref() { @@ -90,161 +99,200 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, 
_) = stream.get_ref(); - - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. - ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. 
- let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) + let n_connections = Metrics::get() + .proxy + .client_connections + .sample(crate::metrics::Protocol::Http); + tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check"); + if n_connections > config.http_config.client_conn_threshold { + tracing::trace!("attempting to cancel a random connection"); + if let Some(token) = config.http_config.cancel_set.take() { + tracing::debug!("cancelling a random connection"); + token.cancel() } - }, - ); + } - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; + let conn_token = cancellation_token.child_token(); + let conn = connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + conn_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span); - // await websocket connections - ws_connections.wait().await; + connections.spawn(async move { + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token); + conn.await + }); + } + + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. 
+#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + // handle PROXY protocol + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. 
+ let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. 
+ let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + http_request_token, + endpoint_rate_limiter.clone(), + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + tracing::debug!(%peer_addr, "cancelling connection"); + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, - endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - + endpoint_rate_limiter: Arc, +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -254,7 +302,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. 
if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -268,8 +322,8 @@ async fn request_handler( ctx, websocket, cancellation_handler, - host, endpoint_rate_limiter, + host, ) .await { @@ -281,14 +335,19 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -298,7 +357,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72b55c45f0..6b79c12316 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,15 +6,18 @@ use 
tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, - messages::ColdStartInfo, + locks::ApiLocks, + provider::ApiLockError, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, - proxy::connect_compute::ConnectMechanism, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, + Host, }; use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; @@ -22,12 +25,14 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, } impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -36,13 +41,25 @@ impl PoolingBackend { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => backend.get_role_secret(ctx).await?, }; let secret = match cached_secret.value.clone() { - Some(secret) => secret, + Some(secret) => self.config.authentication_config.check_rate_limit( + ctx, + config, + secret, + &user_info.endpoint, + true, + )?, None => { // If we don't have an authentication secret, for the http flow we can just return an error. 
info!("authentication info not found"); @@ -52,7 +69,10 @@ impl PoolingBackend { let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { - crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) @@ -84,8 +104,6 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } let conn_id = uuid::Uuid::new_v4(); @@ -98,9 +116,12 @@ impl PoolingBackend { conn_id, conn_info, pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, }, &backend, false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, ) .await } @@ -119,6 +140,8 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl ReportableError for HttpConnError { @@ -129,6 +152,7 @@ impl ReportableError for HttpConnError { HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), } } } @@ -141,6 +165,30 @@ impl UserFacingError for HttpConnError { HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() + } + } + } +} + +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } + fn should_retry_database_address(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, } } } @@ -149,12 +197,15 @@ struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, } #[async_trait] impl ConnectMechanism for TokioMechanism { type Connection = Client; - type ConnectError = tokio_postgres::Error; + type ConnectError = HttpConnError; type Error = HttpConnError; async fn connect_once( @@ -163,6 +214,9 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) @@ -170,7 +224,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + drop(pause); + drop(permit); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs new file mode 100644 index 0000000000..390df7f4f7 --- 
/dev/null +++ b/proxy/src/serverless/cancel_set.rs @@ -0,0 +1,102 @@ +//! A set for cancelling random http connections + +use std::{ + hash::{BuildHasher, BuildHasherDefault}, + num::NonZeroUsize, + time::Duration, +}; + +use indexmap::IndexMap; +use parking_lot::Mutex; +use rand::{thread_rng, Rng}; +use rustc_hash::FxHasher; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +type Hasher = BuildHasherDefault; + +pub struct CancelSet { + shards: Box<[Mutex]>, + // keyed by random uuid, fxhasher is fine + hasher: Hasher, +} + +pub struct CancelShard { + tokens: IndexMap, +} + +impl CancelSet { + pub fn new(shards: usize) -> Self { + CancelSet { + shards: (0..shards) + .map(|_| { + Mutex::new(CancelShard { + tokens: IndexMap::with_hasher(Hasher::default()), + }) + }) + .collect(), + hasher: Hasher::default(), + } + } + + pub fn take(&self) -> Option { + for _ in 0..4 { + if let Some(token) = self.take_raw(thread_rng().gen()) { + return Some(token); + } + tracing::trace!("failed to get cancel token"); + } + None + } + + pub fn take_raw(&self, rng: usize) -> Option { + NonZeroUsize::new(self.shards.len()) + .and_then(|len| self.shards[rng % len].lock().take(rng / len)) + } + + pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + let shard = NonZeroUsize::new(self.shards.len()).map(|len| { + let hash = self.hasher.hash_one(id) as usize; + let shard = &self.shards[hash % len]; + shard.lock().insert(id, token); + shard + }); + CancelGuard { shard, id } + } +} + +impl CancelShard { + fn take(&mut self, rng: usize) -> Option { + NonZeroUsize::new(self.tokens.len()).and_then(|len| { + // 10 second grace period so we don't cancel new connections + if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + return None; + } + + let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; + Some(token) + }) + } + + fn remove(&mut self, id: uuid::Uuid) { + 
self.tokens.swap_remove(&id); + } + + fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { + self.tokens.insert(id, (Instant::now(), token)); + } +} + +pub struct CancelGuard<'a> { + shard: Option<&'a Mutex>, + id: Uuid, +} + +impl Drop for CancelGuard<'_> { + fn drop(&mut self) { + if let Some(shard) = self.shard { + shard.lock().remove(self.id); + } + } +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c7e8eaef76..5fa253acf8 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -16,13 +15,13 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; -use crate::console::messages::MetricsAuxInfo; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +77,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +109,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); 
- NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +159,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +183,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +226,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +318,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +349,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. 
if clients_removed > 0 { @@ -339,7 +357,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -383,9 +405,12 @@ impl GlobalConnPool { "pid", &tracing::field::display(client.inner.get_process_id()), ); - info!("pool: reusing connection '{conn_info}'"); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); client.session.send(ctx.session_id)?; - ctx.latency_timer.pool_hit(); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } @@ -407,7 +432,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -447,15 +472,14 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info; span.in_scope(|| { - info!(%conn_info, %session_id, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new 
connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), @@ -465,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -528,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -537,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -565,8 +615,8 @@ impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } @@ -666,6 +716,8 @@ impl Drop 
for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; + use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; + use super::*; struct MockClient(Arc); @@ -691,7 +743,13 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), - aux: Default::default(), + cancel: CancellationToken::new(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, conn_id: uuid::Uuid::new_v4(), } } @@ -709,6 +767,8 @@ mod tests { max_total_conns: 3, }, request_timeout: Duration::from_secs(1), + cancel_set: CancelSet::new(0), + client_conn_threshold: u64::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. 
+ +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg 
}.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f675375ff1..5376bddfd3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use 
utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -40,15 +43,19 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; +use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -215,18 +222,11 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { - let cancel2 = cancel.clone(); - let handle = tokio::spawn(async move { - time::sleep(config.http_config.request_timeout).await; - cancel2.cancel(); - }); - +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; - handle.abort(); let mut response = match result { Ok(r) => { @@ -237,10 +237,7 @@ pub async fn handle( let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); - let message = format!( - "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. 
Please consider using our websocket based connections", - config.http_config.request_timeout.as_secs_f64() - ); + let message = "Query cancelled, connection was terminated"; tracing::info!( kind=error_kind.to_metric_label(), @@ -339,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -403,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -428,65 +424,107 @@ pub enum SqlOverHttpCancel { impl ReportableError for SqlOverHttpCancel { fn get_error_kind(&self) -> ErrorKind { match self { - SqlOverHttpCancel::Postgres => ErrorKind::RateLimit, - SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit, + SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect, + SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect, } } } +#[derive(Clone, Copy, Debug)] +struct HttpHeaders { + raw_output: bool, + default_array_mode: bool, + txn_isolation_level: Option, + txn_read_only: bool, + txn_deferrable: bool, +} + +impl HttpHeaders { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. 
+ let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + + // isolation level, read only and deferrable + let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { + Some(x) => Some( + map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, + ), + None => None, + }; + + let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); + let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + + Ok(Self { + raw_output, + default_array_mode, + txn_isolation_level, + txn_read_only, + txn_deferrable, + }) + } +} + +fn map_header_to_isolation_level(level: &HeaderValue) -> Option { + match level.as_bytes() { + b"Serializable" => Some(IsolationLevel::Serializable), + b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), + b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), + b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), + _ => None, + } +} + +fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { + match level { + IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), + IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), + IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), + IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), + _ => None, + } +} + async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = 
%ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params // let headers = request.headers(); + // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); - // Determine the output options. Default behaviour is 'false'. Anything that is not - // strictly 'true' assumed to be false. - let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); - // isolation level, read only and deferrable - - let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); - let txn_isolation_level = match txn_isolation_level_raw { - Some(ref x) => Some(match x.as_bytes() { - b"Serializable" => IsolationLevel::Serializable, - b"ReadUncommitted" => IsolationLevel::ReadUncommitted, - b"ReadCommitted" => IsolationLevel::ReadCommitted, - b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => return Err(SqlOverHttpError::InvalidIsolationLevel), - }), - None => None, - }; - - let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); - let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + let parsed_headers = HttpHeaders::try_parse(headers)?; let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH.observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a 
streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -495,7 +533,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -503,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; @@ -514,20 +554,18 @@ async fn handle_inner( } .map_err(SqlOverHttpError::from); - // Run both operations in parallel - let (payload, mut client) = match select( + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel try_join( pin!(fetch_and_process_request), pin!(authenticate_and_connect), ), - pin!(cancel.cancelled()), + &cancel, ) .await { - Either::Left((result, _cancelled)) => result?, - Either::Right((_cancelled, _)) => { - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) - } + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), }; let mut response = Response::builder() @@ -537,95 +575,145 @@ async fn handle_inner( // // Now execute the query and return the result // - let mut size = 0; let result = match payload { - Payload::Single(stmt) => { - let mut size = 0; - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let query = pin!(query_to_json( - &*inner, - stmt, - &mut size, - raw_output, - default_array_mode - )); - let cancelled = pin!(cancel.cancelled()); - let res = select(query, 
cancelled).await; - match res { - Either::Left((Ok((status, results)), _cancelled)) => { - discard.check_idle(status); - results - } - Either::Left((Err(e), _cancelled)) => { - discard.discard(); - return Err(e); - } - Either::Right((_cancelled, query)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - match time::timeout(time::Duration::from_millis(100), query).await { - Ok(Ok((status, results))) => { - discard.check_idle(status); - results - } - Ok(Err(error)) => { - let db_error = match &error { - SqlOverHttpError::ConnectCompute( - HttpConnError::ConnectionError(e), - ) - | SqlOverHttpError::Postgres(e) => e.as_db_error(), - _ => None, - }; - - // if errored for some other reason, it might not be safe to return - if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { - discard.discard(); - } - - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(_timeout) => { - discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - } - } - } - } + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { - info!("starting transaction"); - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - if txn_read_only { - builder = builder.read_only(true); + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the 
pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = match query_batch( - cancel.child_token(), - &transaction, - statements, - &mut size, - raw_output, - default_array_mode, - ) - .await + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + + statements + .process(cancel, &mut client, parsed_headers) + .await? + } + }; + + let metrics = client.metrics(); + + // how could this possibly fail + let body = serde_json::to_string(&result).expect("json serialization should not fail"); + let len = body.len(); + let response = response + .body(Full::new(Bytes::from(body))) + // only fails if invalid status code or invalid header/values are given. + // these are not user configurable so it cannot fail dynamically + .expect("building response payload should not fail"); + + // count the egress bytes - we miss the TLS and header overhead but oh well... + // moving this later in the stack is going to be a lot of effort and ehhhh + metrics.record_egress(len as u64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); + + Ok(response) +} + +impl QueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + + let res = match select( + pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(cancel.cancelled()), + ) + .await + { + // The query successfully completed. + Either::Left((Ok((status, results)), __not_yet_cancelled)) => { + discard.check_idle(status); + Ok(results) + } + // The query failed with an error + Either::Left((Err(e), __not_yet_cancelled)) => { + discard.discard(); + return Err(e); + } + // The query was cancelled. 
+ Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // wait for the query cancellation + match time::timeout(time::Duration::from_millis(100), query).await { + // query successed before it was cancelled. + Ok(Ok((status, results))) => { + discard.check_idle(status); + Ok(results) + } + // query failed or was cancelled. + Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + Err(_timeout) => { + discard.discard(); + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + } + } + }; + res + } +} + +impl BatchQueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = parsed_headers.txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if parsed_headers.txn_read_only { + builder = builder.read_only(true); + } + if parsed_headers.txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. 
connection is clearly broken + discard.discard(); + e + })?; + + let results = + match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(results) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { @@ -659,44 +747,15 @@ async fn handle_inner( } }; - if txn_read_only { - response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); - } - if txn_deferrable { - response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) - } - }; - - let metrics = client.metrics(); - - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); - let response = response - .body(Body::from(body)) - // only fails if invalid status code or invalid header/values are given. - // these are not user configurable so it cannot fail dynamically - .expect("building response payload should not fail"); - - // count the egress bytes - we miss the TLS and header overhead but oh well... 
- // moving this later in the stack is going to be a lot of effort and ehhhh - metrics.record_egress(len as u64); - - Ok(response) + Ok(json!({ "results": results })) + } } async fn query_batch( cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, - total_size: &mut usize, - raw_output: bool, - array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; @@ -705,8 +764,7 @@ async fn query_batch( transaction, stmt, &mut current_size, - raw_output, - array_mode + parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; @@ -723,7 +781,6 @@ async fn query_batch( } } } - *total_size += current_size; Ok(results) } @@ -731,8 +788,7 @@ async fn query_to_json( client: &T, data: QueryData, current_size: &mut usize, - raw_output: bool, - default_array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { info!("executing query"); let query_params = data.params; @@ -792,12 +848,12 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } - let array_mode = data.array_mode.unwrap_or(default_array_mode); + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); // convert rows to JSON let rows = rows .iter() - .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode)) + .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, 
conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS 
connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a72ede6d0a..649bec2c7c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,9 +1,9 @@ use crate::{ - cancellation::CancellationHandler, + cancellation::CancellationHandlerMain, config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -134,14 +134,15 @@ pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancellation_handler: Arc, - hostname: Option, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, + hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, @@ -158,17 +159,15 @@ pub async 
fn serve_websocket( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); Err(e.into()) } Ok(None) => { ctx.set_success(); - ctx.log(); Ok(()) } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); p.proxy_pass().await } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d75aedf89b..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,20 +1,35 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, + intern::{BranchIdInt, EndpointIdInt}, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. 
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: EndpointId, - pub branch_id: BranchId, + pub endpoint_id: EndpointIdInt, + pub branch_id: BranchIdInt, +} + +pub trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); +} + +#[derive(Debug)] +struct MetricBackupCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, +} + +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + } + + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } } #[derive(Debug)] pub struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, + backup: Arc, } -impl MetricCounter { +impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); + self.backup.record_egress(bytes); } + /// Record that some connections were opened + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + self.backup.record_connection(count); + } +} + +impl MetricCounterReporter for MetricCounter { + fn 
get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +trait Clearable { /// extract the value that should be reported + fn should_report(self: &Arc) -> Option; + /// Determine whether the counter should be cleared from the global map. + fn should_clear(self: &mut Arc) -> bool; +} + +impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. @@ -54,13 +143,12 @@ impl MetricCounter { // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. let is_open = Arc::strong_count(self) > 1; - let opened = self.opened_connections.swap(0, Ordering::AcqRel); // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let value = self.transmitted.swap(0, Ordering::AcqRel); + let (value, opened) = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report @@ -70,15 +158,12 @@ impl MetricCounter { Some(value) } } - - /// Determine whether the counter should be cleared from the global map. 
fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -104,12 +204,13 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } @@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -173,33 +262,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. 
- // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -213,23 +340,142 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. 
+ let storage = backup_config + .remote_storage_config + .as_ref() + .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) + .transpose()?; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. 
+ for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + error!("failed to create remote path from str {path}: {:?}", e); + continue; + } + }; + + let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; + + if let Err(e) = res { + error!( + "failed to upload consumption events to remote storage: {:?}", + e + ); + } + } +} + +async fn upload_events_chunk( + storage: &Option, + chunk: EventChunk<'_, Event>, + remote_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let storage = match storage { + Some(storage) => storage, + None => { + error!("no remote storage configured"); + return Ok(()); + } + }; + let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + let mut encoder = GzipEncoder::new(Vec::new()); + encoder.write_all(&data).await.context("compress metrics")?; + encoder.shutdown().await.context("compress metrics")?; + let compressed_data: Bytes = encoder.get_ref().clone().into(); + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); + storage + .upload(stream, compressed_data.len(), remote_path, None, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload")?; + Ok(()) } #[cfg(test)] @@ -248,8 +494,8 @@ mod tests { }; use url::Url; - use super::{collect_metrics_iteration, Ids, Metrics}; - use crate::{http, 
rate_limiter::RateLimiterConfig}; + use super::*; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -279,23 +525,24 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); // no counters have been registered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // register a new counter + let counter = metrics.register(Ids { - endpoint_id: "e1".into(), - branch_id: "b1".into(), + endpoint_id: (&EndpointId::from("e1")).into(), + branch_id: (&BranchId::from("b1")).into(), }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -305,7 +552,7 @@ mod tests { counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -315,11 +562,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered 
assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } diff --git a/pyproject.toml b/pyproject.toml index e347d47cbf..ac7f9b061c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,17 +14,17 @@ requests = "^2.31.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.3" +Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.1" +Werkzeug = "^3.0.3" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.2" +aiohttp = "3.9.4" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" @@ -94,4 +94,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index b0949c32b1..214de0a77d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.76.0" +channel = "1.78.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 4d136472e0..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,9 +22,15 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true +camino.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. +Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency. +Errors are logged to stderr and a summary to stdout.
+For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to a table with a debug dump, which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7c0f699958..dd64a0a98f 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +170,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. 
#[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +179,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,7 +207,7 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, + layer_file: &LayerName, metadata: &IndexLayerMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { @@ -224,7 +223,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -247,25 +246,25 @@ pub(crate) struct S3TimelineBlobData { #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } @@ -369,7 +368,7 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 45cac23690..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -137,7 +139,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, @@ -155,7 +157,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime, @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - 
Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active 
projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -327,6 +330,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result, Error> { let _permit = self @@ -335,43 +339,61 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - 
ErrorKind::BodyRead(e), - ) - })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; + + let mut branches: Vec = response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } @@ -532,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7a08dffc66..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. 
This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -119,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. -const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement real rsp limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API @@ -221,6 +226,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -261,7 +267,7 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) @@ -271,15 +277,29 @@ async fn find_garbage_inner( std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. 
+ let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +364,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. +/// `num_deleted` returns number of deleted keys. async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +394,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. 
+#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +438,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -429,6 +481,7 @@ pub async fn purge_garbage( std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; objects_to_delete.append(&mut object_list); @@ -439,6 +492,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +504,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2842877d0..7966fb6a88 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,9 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; 
+pub mod scan_safekeeper_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +25,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -139,12 +142,34 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } + } + + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } } pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { @@ -240,7 +265,6 @@ pub fn 
init_logging(file_name: &str) -> WorkerGuard { .with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -288,7 +312,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) @@ -319,9 +346,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; @@ -346,7 +371,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } @@ -396,3 +424,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: &str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = 
tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 957213856b..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,16 @@ +use anyhow::bail; +use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; +use s3_scrubber::tenant_snapshot::SnapshotDownloader; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; +use utils::id::TenantId; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -32,11 +39,28 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + #[command(verbatim_doc_comment)] ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] 
tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, + }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, }, } @@ -50,6 +74,7 @@ async fn main() -> anyhow::Result<()> { Command::ScanMetadata { .. } => "scan", Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. } => "tenant-snapshot", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -60,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a 
valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } @@ -102,5 +169,14 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { input_path, mode } => { purge_garbage(input_path, mode, !cli.delete).await } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let 
downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 073f37f319..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! 
{ + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. @@ -58,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -95,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! { for i in timeline_ids { let id = i?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..73dd49ceb5 --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,236 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. 
+const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. 
+ const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List s3 and check that is has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; returns Ok(true) if timeline is consistent, +/// Ok(false) if not, Err if failed to check. +async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it. 
+ timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. + let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..a24a1e92ae --- /dev/null +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,293 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use 
pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerName, + layer_metadata: IndexLayerMetadata, + ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name, + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. + if tokio::fs::try_exists(&local_path).await? 
{ + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name, + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. + let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, IndexLayerMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! 
{ + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: TenantShardTimelineId, + index_part: Box, + index_part_generation: Generation, + ancestor_layers: &mut HashMap< + TenantShardTimelineId, + HashMap, + >, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer 
from an ancestor should always have matching metadata, + // as their siblings, because it is read atomically during a shard split. + assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. 
+ let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in ancestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cb4a1def1f..c8b732fee1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ once_cell.workspace = true parking_lot.workspace
= true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3c4c81e499..09c565ce71 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,7 +28,7 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::wal_service; use safekeeper::GlobalTimelines; @@ -170,6 +170,17 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, + /// Enable partial backup. If disabled, safekeeper will not upload partial + /// segments to remote storage. + #[arg(long)] + partial_backup_enabled: bool, + /// Controls how long backup will wait until uploading the partial segment. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] + partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. + #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. 
@@ -300,6 +311,9 @@ async fn main() -> anyhow::Result<()> { http_auth, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, + partial_backup_enabled: args.partial_backup_enabled, + partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided @@ -365,6 +379,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + wal_backup::init_remote_storage(&conf); + // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..98f58d3e49 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. 
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? 
+ .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? { + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better 
way to // reap tasks individually? @@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. + fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! 
{ + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d822c87c0e..fe9f2e6899 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 7; +pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper const CONTROL_FILE_NAME: &str = "safekeeper.control"; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 2fd719326d..8f4dfe9b43 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, state::{PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. 
+ #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently driven + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades).
+ pub peers: PersistedPeers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result RouterBuilder .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index ce4b4d7bd0..543714a54e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -32,6 +32,7 @@ pub mod send_wal; pub mod state; pub mod timeline; pub mod wal_backup; +pub mod wal_backup_partial; pub mod wal_service; pub mod wal_storage; @@ -48,6 +49,7 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); + pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; } #[derive(Debug, Clone)] @@ -79,6 +81,9 @@ pub struct SafeKeeperConf { pub http_auth: Option>, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, + pub partial_backup_enabled: bool, + pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { @@ -123,6 +128,9 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e541527b6a..28ae042bb3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") }); +pub static 
PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d7c8fa6955..e671d4f36a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -725,6 +725,18 @@ where self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. 
+ if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -1007,6 +1019,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -1221,6 +1237,7 @@ mod tests { commit_lsn: Lsn(1234567600), }, )]), + partial_backup: crate::wal_backup_partial::State::default(), }; let ser = state.ser().unwrap(); @@ -1266,6 +1283,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, + // partial_backup + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7da5fd00b0..59a8c595ab 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -506,6 +506,8 @@ struct WalSender<'a, IO> { send_buf: [u8; MAX_SEND_SIZE], } +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs @@ -584,14 +586,22 @@ impl WalSender<'_, IO> { async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { self.end_pos = self.end_watch.get(); - if self.end_pos > self.start_pos { - // We have something to send. + let have_something_to_send = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { false } + ); + self.end_pos > self.start_pos + })(); + + if have_something_to_send { trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); } // Wait for WAL to appear, now self.end_pos == self.start_pos. 
- if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? { + if let Some(lsn) = self.wait_for_lsn().await? { self.end_pos = lsn; trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); @@ -628,6 +638,54 @@ impl WalSender<'_, IO> { .await?; } } + + /// Wait until we have available WAL > start_pos or timeout expires. Returns + /// - Ok(Some(end_pos)) if needed lsn is successfully observed; + /// - Ok(None) if timeout expired; + /// - Err in case of error -- only if 1) term changed while fetching in recovery + /// mode 2) watch channel closed, which must never happen. + async fn wait_for_lsn(&mut self) -> anyhow::Result> { + let fp = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { true } + ); + false + })(); + if fp { + tokio::time::sleep(POLL_STATE_TIMEOUT).await; + return Ok(None); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + loop { + let end_pos = self.end_watch.get(); + if end_pos > self.start_pos { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self.end_watch { + let curr_term = rx.borrow().term; + if let Some(client_term) = self.term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.end_watch.changed().await?; + } + }) + .await; + + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), + } + } } /// A half driving receiving replies. @@ -685,47 +743,6 @@ impl ReplyReader { } } -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Wait until we have available WAL > start_pos or timeout expires. Returns -/// - Ok(Some(end_pos)) if needed lsn is successfully observed; -/// - Ok(None) if timeout expired; -/// - Err in case of error -- only if 1) term changed while fetching in recovery -/// mode 2) watch channel closed, which must never happen. 
-async fn wait_for_lsn( - rx: &mut EndWatch, - client_term: Option, - start_pos: Lsn, -) -> anyhow::Result> { - let res = timeout(POLL_STATE_TIMEOUT, async move { - loop { - let end_pos = rx.get(); - if end_pos > start_pos { - return Ok(end_pos); - } - if let EndWatch::Flush(rx) = rx { - let curr_term = rx.borrow().term; - if let Some(client_term) = client_term { - if curr_term != client_term { - bail!("term changed: requested {}, now {}", client_term, curr_term); - } - } - } - rx.changed().await?; - } - }) - .await; - - match res { - // success - Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), - // error inside closure - Ok(Err(err)) => Err(err), - // timeout - Err(_) => Ok(None), - } -} - #[cfg(test)] mod tests { use utils::id::{TenantId, TimelineId}; diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 82f7954051..be5e516296 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -13,6 +13,7 @@ use utils::{ use crate::{ control_file, safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, }; /// Persistent information stored on safekeeper node about timeline. @@ -54,11 +55,14 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - // Peers and their state as we remember it. Knowing peers themselves is - // fundamental; but state is saved here only for informational purposes and - // obviously can be stale. (Currently not saved at all, but let's provision - // place to have less file version upgrades). + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). 
pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -93,6 +97,7 @@ impl TimelinePersistentState { .map(|p| (*p, PersistedPeerInfo::new())) .collect(), ), + partial_backup: wal_backup_partial::State::default(), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4901b86acf..64f764f191 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_storage}; +use crate::{debug_dump, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -503,6 +503,9 @@ impl Timeline { if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); + } } /// Delete timeline from disk completely, by removing timeline directory. 
@@ -667,8 +670,8 @@ impl Timeline { term_flush_lsn = TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.commit_lsn_watch_tx.send(commit_lsn)?; self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 944d80f777..e496f07114 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } +pub fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. + REMOTE_STORAGE.get_or_init(|| { + conf.remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) + }); +} + const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup @@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main( conf.remote_storage ); - let conf_ = conf.clone(); - REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); - // Presence in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. 
@@ -518,6 +520,35 @@ async fn backup_object( .await } +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, @@ -570,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. - // Alternatively we could make list_files return iterator, but it is more complicated and + // Alternatively we could make remote storage list return iterator, but it is more complicated and // I'm not sure deleting while iterating is expected in s3. loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size), &cancel) - .await?; + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? + .keys; if files.is_empty() { return Ok(()); // done } @@ -604,6 +641,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { Ok(()) } +/// Used by wal_backup_partial. +pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. 
pub async fn copy_s3_segments( wal_seg_size: usize, @@ -628,8 +672,9 @@ pub async fn copy_s3_segments( let cancel = CancellationToken::new(); let files = storage - .list_files(Some(&remote_path), None, &cancel) - .await?; + .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .await? + .keys; let uploaded_segments = &files .iter() diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..200096ac5c --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,407 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` +//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. 
+ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use rand::Rng; +use remote_storage::RemotePath; +use serde::{Deserialize, Serialize}; + +use tracing::{debug, error, info, instrument}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + safekeeper::Term, + timeline::Timeline, + wal_backup, SafeKeeperConf, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum UploadStatus { + /// Upload is in progress + InProgress, + /// Upload is finished + Uploaded, + /// Deletion is in progress + Deleting, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PartialRemoteSegment { + pub status: UploadStatus, + pub name: String, + pub commit_lsn: Lsn, + pub flush_lsn: Lsn, + pub term: Term, +} + +impl PartialRemoteSegment { + fn eq_without_status(&self, other: &Self) -> bool { + self.name == other.name + && self.commit_lsn == other.commit_lsn + && self.flush_lsn == other.flush_lsn + && self.term == other.term + } +} + +// NB: these structures are a part of a control_file, you can't change them without +// changing the control file format version. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +pub struct State { + pub segments: Vec, +} + +impl State { + /// Find an Uploaded segment. There should be only one Uploaded segment at a time. 
+ fn uploaded_segment(&self) -> Option { + self.segments + .iter() + .find(|seg| seg.status == UploadStatus::Uploaded) + .cloned() + } +} + +struct PartialBackup { + wal_seg_size: usize, + tli: Arc, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_prefix: Utf8PathBuf, + + state: State, +} + +// Read-only methods for getting segment names +impl PartialBackup { + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let term = sk_info.term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. 
+ let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + if sk_info.term != prepared.term { + anyhow::bail!("term changed during upload"); + } + assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); + assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn)); + + Ok(()) + } + + /// Write new state to disk. If in-memory and on-disk states diverged, returns an error. + async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> { + self.tli + .map_control_file(|cf| { + if cf.partial_backup != self.state { + let memory = self.state.clone(); + self.state = cf.partial_backup.clone(); + anyhow::bail!( + "partial backup state diverged, memory={:?}, disk={:?}", + memory, + cf.partial_backup + ); + } + + cf.partial_backup = new_state.clone(); + Ok(()) + }) + .await?; + // update in-memory state + self.state = new_state; + Ok(()) + } + + /// Upload the latest version of the partial segment and garbage collect older versions. 
+ #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] + async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + info!("starting upload {:?}", prepared); + + let state_0 = self.state.clone(); + let state_1 = { + let mut state = state_0.clone(); + state.segments.push(prepared.clone()); + state + }; + + // we're going to upload a new segment, let's write it to disk to make GC later + self.commit_state(state_1).await?; + + self.upload_segment(prepared.clone()).await?; + + let state_2 = { + let mut state = state_0.clone(); + for seg in state.segments.iter_mut() { + seg.status = UploadStatus::Deleting; + } + let mut actual_remote_segment = prepared.clone(); + actual_remote_segment.status = UploadStatus::Uploaded; + state.segments.push(actual_remote_segment); + state + }; + + // we've uploaded new segment, it's actual, all other segments should be GCed + self.commit_state(state_2).await?; + self.gc().await?; + + Ok(()) + } + + /// Delete all non-Uploaded segments from the remote storage. There should be only one + /// Uploaded segment at a time. 
+ #[instrument(name = "gc", skip_all)] + async fn gc(&mut self) -> anyhow::Result<()> { + let mut segments_to_delete = vec![]; + + let new_segments: Vec = self + .state + .segments + .iter() + .filter_map(|seg| { + if seg.status == UploadStatus::Uploaded { + Some(seg.clone()) + } else { + segments_to_delete.push(seg.name.clone()); + None + } + }) + .collect(); + + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + objects_to_delete.push(remote_path); + } + + // removing segments from remote storage + wal_backup::delete_objects(&objects_to_delete).await?; + + // now we can update the state on disk + let new_state = { + let mut state = self.state.clone(); + state.segments = new_segments; + state + }; + self.commit_state(new_state).await?; + + Ok(()) + } +} + +#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { + debug!("started"); + let await_duration = conf.partial_backup_timeout; + + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + // sleep for random time to avoid thundering herd + { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + let sleep_duration = await_duration.mul_f64(randf64); + tokio::time::sleep(sleep_duration).await; + } + + let (_, persistent_state) = tli.get_state().await; + let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); + let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.timeline_dir.clone(); + let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { + Ok(path) => path.to_owned(), + Err(e) => { + error!("failed to strip workspace dir prefix: {:?}", e); + return; + } + }; + + let mut 
backup = PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_prefix, + }; + + debug!("state: {:?}", backup.state); + + 'outer: loop { + // wait until we have something to upload + let uploaded_segment = backup.state.uploaded_segment(); + if let Some(seg) = &uploaded_segment { + // if we already uploaded something, wait until we have something new + while flush_lsn_rx.borrow().lsn == seg.flush_lsn + && *commit_lsn_rx.borrow() == seg.commit_lsn + && flush_lsn_rx.borrow().term == seg.term + { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => {} + } + } + } + + // if we don't have any data and zero LSNs, wait for something + while flush_lsn_rx.borrow().lsn == Lsn(0) { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = flush_lsn_rx.changed() => {} + } + } + + // fixing the segno and waiting some time to prevent reuploading the same segment too often + let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); + let timeout = tokio::time::sleep(await_duration); + tokio::pin!(timeout); + let mut timeout_expired = false; + + // waiting until timeout expires OR segno changes + 'inner: loop { + tokio::select! 
{ + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => { + let segno = backup.segno(flush_lsn_rx.borrow().lsn); + if segno != pending_segno { + // previous segment is no longer partial, aborting the wait + break 'inner; + } + } + _ = &mut timeout => { + // timeout expired, now we are ready for upload + timeout_expired = true; + break 'inner; + } + } + } + + if !timeout_expired { + // likely segno has changed, let's try again in the next iteration + continue 'outer; + } + + let prepared = backup.prepare_upload().await; + if let Some(seg) = &uploaded_segment { + if seg.eq_without_status(&prepared) { + // we already uploaded this segment, nothing to do + continue 'outer; + } + } + + match backup.do_upload(&prepared).await { + Ok(()) => { + debug!( + "uploaded {} up to flush_lsn {}", + prepared.name, prepared.flush_lsn + ); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc(); + } + Err(e) => { + info!("failed to upload {}: {:#}", prepared.name, e); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc(); + } + } + } +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8bbd95e9e8..6bc8c7c3f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -38,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. 
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -78,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -169,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -221,6 +231,7 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. let tmp_path = self.timeline_dir.join("waltmp"); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) @@ -323,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e3aaf5d391..27e2a4453b 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -176,6 +176,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { http_auth: None, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 35bca325aa..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -182,6 +182,10 @@ impl wal_storage::Storage for DiskWALStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + /// Write piece of WAL from buf to disk, but not necessarily sync it. 
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { if self.write_lsn != startpos { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 42340ba1df..5578c94cf6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -17,8 +17,7 @@ use utils::lsn::Lsn; use walproposer::{ api_bindings::Level, bindings::{ - pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents, - WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }, walproposer::{ApiImpl, Config}, }; @@ -224,30 +223,13 @@ impl SimulationApi { }) .collect::>(); - let empty_feedback = PageserverFeedback { - present: false, - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - shard_number: 0, - }; - Self { os: args.os, safekeepers: RefCell::new(sk_conns), disk: args.disk, redo_start_lsn: args.redo_start_lsn, last_logged_commit_lsn: 0, - shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { - mutex: 0, - mineLastElectedTerm: 0, - backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, - shard_ps_feedback: [empty_feedback; 128], - num_shards: 0, - min_ps_feedback: empty_feedback, - }), + shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()), config: args.config, event_set: RefCell::new(None), } @@ -273,6 +255,12 @@ impl ApiImpl for SimulationApi { self.os.now() as i64 * 1000 } + fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + } + fn conn_status( &self, _: &mut walproposer::bindings::Safekeeper, diff --git a/scripts/export_import_between_pageservers.py 
b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 980f343047..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,736 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. -# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. 
We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. 
- """ - assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. 
- """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! - conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. 
- """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. - """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: 
requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, 
tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) - ) - time.sleep(1) - - raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. 
Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. -def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - 
print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare 
- re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. 
Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND 
reference = 'refs/heads/main') OR flaky diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..8c88b61abc 100644 --- 
a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter diff --git a/control_plane/attachment_service/Cargo.toml b/storage_controller/Cargo.toml similarity index 74% rename from control_plane/attachment_service/Cargo.toml rename to storage_controller/Cargo.toml index 34882659e3..194619a496 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "attachment_service" +name = "storage_controller" version = "0.1.0" edition.workspace = true license.workspace = true @@ -16,7 +16,6 @@ testing = [] [dependencies] anyhow.workspace = true aws-config.workspace = true -aws-sdk-secretsmanager.workspace = true bytes.workspace = true camino.workspace = true clap.workspace = true @@ -26,12 +25,13 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +itertools.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = ["stream"] } 
routerify.workspace = true serde.workspace = true serde_json.workspace = true @@ -40,13 +40,15 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +strum.workspace = true +strum_macros.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } -utils = { path = "../../libs/utils/" } -metrics = { path = "../../libs/metrics/" } -control_plane = { path = ".." } -workspace_hack = { version = "0.1", path = "../../workspace_hack" } +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/attachment_service/migrations/.keep b/storage_controller/migrations/.keep similarity index 100% rename from control_plane/attachment_service/migrations/.keep rename to storage_controller/migrations/.keep diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql similarity index 100% rename from 
control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/down.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql rename to 
storage_controller/migrations/2024-02-29-094122_generations_null/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql new file mode 100644 index 0000000000..33c06dc03d --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql @@ -0,0 +1,3 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE tenant_shards drop scheduling_policy; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql new file mode 100644 index 0000000000..aa00f0d2ca --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql @@ -0,0 +1,2 @@ + +ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"'; diff --git a/control_plane/attachment_service/src/auth.rs b/storage_controller/src/auth.rs similarity index 100% rename from control_plane/attachment_service/src/auth.rs rename to storage_controller/src/auth.rs diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs new file mode 100644 index 
0000000000..9d326ef82d --- /dev/null +++ b/storage_controller/src/compute_hook.rs @@ -0,0 +1,687 @@ +use std::sync::Arc; +use std::{collections::HashMap, time::Duration}; + +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; +use control_plane::local_env::LocalEnv; +use futures::StreamExt; +use hyper::StatusCode; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; +use postgres_connection::parse_host_port; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; +use utils::{ + backoff::{self}, + id::{NodeId, TenantId}, +}; + +use crate::service::Config; + +const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); + +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + +pub(crate) const API_CONCURRENCY: usize = 32; + +struct UnshardedComputeHookTenant { + // Which node is this tenant attached to + node_id: NodeId, + + // Must hold this lock to send a notification. + send_lock: Arc>>, +} +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, NodeId)>, + + // Must hold this lock to send a notification. The contents represent + // the last successfully sent notification, and are used to coalesce multiple + // updates by only sending when there is a chance since our last successful send. 
+ send_lock: Arc>>, +} + +enum ComputeHookTenant { + Unsharded(UnshardedComputeHookTenant), + Sharded(ShardedComputeHookTenant), +} + +impl ComputeHookTenant { + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + send_lock: Arc::default(), + }) + } else { + Self::Unsharded(UnshardedComputeHookTenant { + node_id, + send_lock: Arc::default(), + }) + } + } + + fn get_send_lock(&self) -> &Arc>> { + match self { + Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, + Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, + } + } + + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { + unsharded_tenant.node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) + } + } + _ => { + // Shard count changed: reset struct. 
+ *self = Self::new(tenant_shard_id, stripe_size, node_id); + } + } + } +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +/// Request body that we send to the control plane to notify it of where a tenant is attached +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} + +/// Error type for attempts to call into the control plane compute notification hook +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotifyError { + // Request was not send successfully, e.g. transport error + #[error("Sending request: {0}")] + Request(#[from] reqwest::Error), + // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. + #[error("Control plane tenant busy")] + Busy, + // Explicit 429 response asking us to retry less frequently + #[error("Control plane overloaded")] + SlowDown, + // A 503 response indicates the control plane can't handle the request right now + #[error("Control plane unavailable (status {0})")] + Unavailable(StatusCode), + // API returned unexpected non-success status. We will retry, but log a warning. + #[error("Control plane returned unexpected status {0}")] + Unexpected(StatusCode), + // We shutdown while sending + #[error("Shutting down")] + ShuttingDown, + // A response indicates we will never succeed, such as 400 or 404 + #[error("Non-retryable error {0}")] + Fatal(StatusCode), +} + +enum MaybeSendResult { + // Please send this request while holding the lock, and if you succeed then write + // the request into the lock. 
+ Transmit( + ( + ComputeHookNotifyRequest, + tokio::sync::OwnedMutexGuard>, + ), + ), + // Something requires sending, but you must wait for a current sender then call again + AwaitLock(Arc>>), + // Nothing requires sending + Noop, +} + +impl ComputeHookTenant { + fn maybe_send( + &self, + tenant_id: TenantId, + lock: Option>>, + ) -> MaybeSendResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + }; + locked + } + }; + + let request = match self { + Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { + tenant_id, + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: unsharded_tenant.node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards + + tracing::info!( + "ComputeHookTenant::maybe_send: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + }; + + match request { + None => { + // Not yet ready to emit a notification + tracing::info!("Tenant isn't yet ready to emit a notification"); + MaybeSendResult::Noop + } + Some(request) if Some(&request) == locked.as_ref() => { + // No change from the last value successfully sent + MaybeSendResult::Noop + } + Some(request) => 
MaybeSendResult::Transmit((request, locked)), + } + } +} + +/// The compute hook is a destination for notifications about changes to tenant:pageserver +/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures +/// the compute connection string. +pub(super) struct ComputeHook { + config: Config, + state: std::sync::Mutex>, + authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_lock + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, +} + +impl ComputeHook { + pub(super) fn new(config: Config) -> Self { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {}", jwt)); + + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + + Self { + state: Default::default(), + config, + authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, + } + } + + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local( + &self, + reconfigure_request: &ComputeHookNotifyRequest, + ) -> anyhow::Result<()> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + + let env = match LocalEnv::load_config() { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + 
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; + + let compute_pageservers = shards + .iter() + .map(|shard| { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + (pg_host, pg_port.unwrap_or(5432)) + }) + .collect::>(); + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { + tracing::info!("Reconfiguring endpoint {}", endpoint_name,); + endpoint + .reconfigure(compute_pageservers.clone(), *stripe_size) + .await?; + } + } + + Ok(()) + } + + async fn do_notify_iteration( + &self, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let req = self.client.request(reqwest::Method::PUT, url); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + + tracing::info!( + "Sending notify request to {} ({:?})", + url, + reconfigure_request + ); + let send_result = req.json(&reconfigure_request).send().await; + let response = match send_result { + Ok(r) => r, + Err(e) => return Err(e.into()), + }; + + // Treat all 2xx responses as success + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { + // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so + // log a warning. 
+ tracing::warn!( + "Unexpected 2xx response code {} from control plane", + response.status() + ); + } + + return Ok(()); + } + + // Error response codes + match response.status() { + reqwest::StatusCode::TOO_MANY_REQUESTS => { + // TODO: 429 handling should be global: set some state visible to other requests + // so that they will delay before starting, rather than all notifications trying + // once before backing off. + tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::SlowDown) + } + reqwest::StatusCode::LOCKED => { + // We consider this fatal, because it's possible that the operation blocking the control one is + // also the one that is waiting for this reconcile. We should let the reconciler calling + // this hook fail, to give control plane a chance to un-lock. + tracing::info!("Control plane reports tenant is locked, dropping out of notify"); + Err(NotifyError::Busy) + } + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) + } + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), + } + } + + async fn do_notify( + &self, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + // We hold these semaphore units across all retries, rather than only across each + // HTTP request: this is to preserve fairness and avoid a situation where 
a retry might + // time out waiting for a semaphore. + let _units = self + .api_concurrency + .acquire() + .await + // Interpret closed semaphore as shutdown + .map_err(|_| NotifyError::ShuttingDown)?; + + backoff::retry( + || self.do_notify_iteration(url, reconfigure_request, cancel), + |e| { + matches!( + e, + NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy + ) + }, + 3, + 10, + "Send compute notification", + cancel, + ) + .await + .ok_or_else(|| NotifyError::ShuttingDown) + .and_then(|x| x) + } + + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + // Process result: we may get an update to send, or we may have to wait for a lock + // before trying again. + let (request, mut send_lock_guard) = match maybe_send_result { + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::AwaitLock(send_lock) => { + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; + + // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here + // we have acquired the send lock and take `[Self::state]` lock. 
This is safe because maybe_send only uses + // try_lock. + let state_locked = self.state.lock().unwrap(); + let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + return Ok(()); + }; + match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + MaybeSendResult::AwaitLock(_) => { + unreachable!("We supplied lock guard") + } + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + } + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + }; + + let result = if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, &request, cancel).await + } else { + self.do_notify_local(&request).await.map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + }; + + if result.is_ok() { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(request); + } + result + } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. 
+ pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. + let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. 
An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } +} + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification, but won't + // send the same one twice + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 1); + 
assert!(request.stripe_size.is_none()); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + // Try asking again: this should be a no-op + let send_result = tenant_state.maybe_send(tenant_id, None); + assert!(matches!(send_result, MaybeSendResult::Noop)); + + // Writing the first shard of a multi-sharded situation (i.e. in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(matches!( + tenant_state.maybe_send(tenant_id, None), + MaybeSendResult::Noop + )); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 2); + assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/heartbeater.rs b/storage_controller/src/heartbeater.rs similarity index 94% rename from control_plane/attachment_service/src/heartbeater.rs rename to storage_controller/src/heartbeater.rs index e15de28920..1ef97e78eb 100644 --- a/control_plane/attachment_service/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -139,7 +139,7 @@ impl HeartbeaterTask { .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, - 2, + 3, 3, Duration::from_secs(1), &cancel, @@ -184,6 +184,19 @@ impl HeartbeaterTask { } } } + tracing::info!( + "Heartbeat round complete for {} nodes, {} 
offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. } => { + false + } + PageserverState::Offline => true, + }) + .count() + ); let mut deltas = Vec::new(); let now = Instant::now(); diff --git a/control_plane/attachment_service/src/http.rs b/storage_controller/src/http.rs similarity index 86% rename from control_plane/attachment_service/src/http.rs rename to storage_controller/src/http.rs index 036019cd38..604ad6fbaa 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/storage_controller/src/http.rs @@ -4,10 +4,12 @@ use crate::metrics::{ }; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -34,7 +36,8 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, + TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -43,15 +46,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -59,6 +66,7 @@ impl HttpState { 
Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -251,6 +259,12 @@ async fn handle_tenant_time_travel_remote_storage( json_response(StatusCode::OK, ()) } +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + async fn handle_tenant_secondary_download( service: Arc, req: Request, @@ -259,7 +273,7 @@ async fn handle_tenant_secondary_download( let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; - json_response(status, progress) + json_response(map_reqwest_hyper_status(status)?, progress) } async fn handle_tenant_delete( @@ -270,7 +284,10 @@ async fn handle_tenant_delete( check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { - service.tenant_delete(tenant_id).await + service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -301,7 +318,10 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { - service.tenant_timeline_delete(tenant_id, timeline_id).await + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -364,11 +384,9 @@ async fn handle_tenant_timeline_passthrough( } // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder() - .status(resp.status()) - .version(resp.version()); + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { - builder = builder.header(k, v); + builder = builder.header(k.as_str(), v.as_bytes()); } let response = builder @@ -398,6 +416,15 @@ async fn 
handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } +async fn handle_tenant_list( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + json_response(StatusCode::OK, service.tenant_list()) +} + async fn handle_node_register(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -411,7 +438,10 @@ async fn handle_node_list(req: Request) -> Result, ApiError check_permissions(&req, Scope::Admin)?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_list().await?) + let nodes = state.service.node_list().await?; + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) } async fn handle_node_drop(req: Request) -> Result, ApiError> { @@ -478,6 +508,22 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .tenant_update_policy(tenant_id, update_req) + .await?, + ) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -487,6 +533,18 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
} +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -509,6 +567,14 @@ async fn handle_consistency_check(req: Request) -> Result, json_response(StatusCode::OK, state.service.consistency_check().await?) } +async fn handle_reconcile_all(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.reconcile_all_now().await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) @@ -565,9 +631,17 @@ where .await } +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. 
fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { check_permission_with(request, |claims| { - crate::auth::check_permission(claims, required_scope) + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } }) } @@ -627,10 +701,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -659,6 +734,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -675,7 +751,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) @@ -706,6 +782,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) .get("/debug/v1/tenant", |r| { named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) @@ -726,6 +809,9 @@ pub fn make_router( RequestName("debug_v1_consistency_check"), ) }) + .post("/debug/v1/reconcile_all", |r| { + request_span(r, handle_reconcile_all) + }) 
.put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) @@ -765,6 +851,16 @@ pub fn make_router( RequestName("control_v1_tenant_describe"), ) }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) + .put("/control/v1/tenant/:tenant_id/policy", |r| { + named_request_span( + r, + handle_tenant_update_policy, + RequestName("control_v1_tenant_policy"), + ) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -816,7 +912,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - // Tenant detail GET passthrough to shard zero + // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( r, @@ -824,13 +920,14 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future - // timeline GET APIs will be implicitly included. - .get("/v1/tenant/:tenant_id/timeline*", |r| { + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. 
+ .get("/v1/tenant/:tenant_id/*", |r| { tenant_service_handler( r, handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_passthrough"), + RequestName("v1_tenant_passthrough"), ) }) } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs new file mode 100644 index 0000000000..dff793289f --- /dev/null +++ b/storage_controller/src/id_lock_map.rs @@ -0,0 +1,189 @@ +use std::fmt::Display; +use std::time::Instant; +use std::{collections::HashMap, sync::Arc}; + +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. +/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. 
+ entities: std::sync::Mutex>>>>, +} + +impl IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, + I: Display, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + operation: I, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, entry| entry.try_write().is_err()) + } +} + +impl Default for IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if 
duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs similarity index 98% rename from control_plane/attachment_service/src/lib.rs rename to storage_controller/src/lib.rs index 8bcd5c0ac4..2ea490a14b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -14,7 +14,7 @@ mod reconciler; mod scheduler; mod schema; pub mod service; -mod tenant_state; +mod tenant_shard; #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); diff --git a/control_plane/attachment_service/src/main.rs b/storage_controller/src/main.rs similarity index 63% rename from control_plane/attachment_service/src/main.rs rename to storage_controller/src/main.rs index 0a925a63f6..f1454af533 100644 --- 
a/control_plane/attachment_service/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,19 +1,22 @@ use anyhow::{anyhow, Context}; -use attachment_service::http::make_router; -use attachment_service::metrics::preinitialize_metrics; -use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; -use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; +use storage_controller::http::make_router; +use storage_controller::metrics::preinitialize_metrics; +use storage_controller::persistence::Persistence; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; +use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); @@ -51,13 +54,41 @@ struct Cli { #[arg(short, long)] path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, + /// Flag to enable dev mode, which permits running without auth + #[arg(long, default_value = "false")] + dev: bool, + /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. 
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, +} + +enum StrictMode { + /// In strict mode, we will require that all secrets are loaded, i.e. security features + /// may not be implicitly turned off by omitting secrets in the environment. + Strict, + /// In dev mode, secrets are optional, and omitting a particular secret will implicitly + /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated + /// requests, no public key -> don't authenticate incoming requests). + Dev, +} + +impl Default for StrictMode { + fn default() -> Self { + Self::Strict + } } /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this @@ -70,13 +101,6 @@ struct Secrets { } impl Secrets { - const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; - const PAGESERVER_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-pageserver-jwt-token"; - const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-control-plane-jwt-token"; - const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; - const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; @@ -87,111 +111,41 @@ impl Secrets { /// - Environment variables if DATABASE_URL is set. 
/// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { - match &args.database_url { - Some(url) => Self::load_cli(url, args), - None => match std::env::var(Self::DATABASE_URL_ENV) { - Ok(database_url) => Self::load_env(database_url), - Err(_) => Self::load_aws_sm().await, - }, - } - } - - fn load_env(database_url: String) -> anyhow::Result { - let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { - Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), - Err(_) => None, - }; - Ok(Self { - database_url, - public_key, - jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), - control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), - }) - } - - async fn load_aws_sm() -> anyhow::Result { - let Ok(region) = std::env::var("AWS_REGION") else { - anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); - }; - let config = aws_config::defaults(BehaviorVersion::v2023_11_09()) - .region(Region::new(region.clone())) - .load() - .await; - - let asm = aws_sdk_secretsmanager::Client::new(&config); - - let Some(database_url) = asm - .get_secret_value() - .secret_id(Self::DATABASE_URL_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string) + let Some(database_url) = + Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await else { anyhow::bail!( - "Database URL secret not found at {region}/{}", - Self::DATABASE_URL_SECRET + "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" ) }; - let jwt_token = asm - .get_secret_value() - .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET) - .send() - .await? 
- .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let control_plane_jwt_token = asm - .get_secret_value() - .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let public_key = asm - .get_secret_value() - .secret_id(Self::PUBLIC_KEY_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - let public_key = match public_key { - Some(key) => Some(JwtAuth::from_key(key)?), - None => { - tracing::warn!( - "No public key set: inccoming HTTP requests will not be authenticated" - ); - None - } + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), + None => None, }; - Ok(Self { + let this = Self { database_url, public_key, - jwt_token, - control_plane_jwt_token, - }) + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + control_plane_jwt_token: Self::load_secret( + &args.control_plane_jwt_token, + Self::CONTROL_PLANE_JWT_TOKEN_ENV, + ) + .await, + }; + + Ok(this) } - fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { - let public_key = match &args.public_key { - None => None, - Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?), - }; - Ok(Self { - database_url: database_url.to_owned(), - public_key, - jwt_token: args.jwt_token.clone(), - control_plane_jwt_token: args.control_plane_jwt_token.clone(), - }) + async fn load_secret(cli: &Option, env_name: &str) -> Option { + if let Some(v) = cli { + Some(v.clone()) + } else if let Ok(v) = std::env::var(env_name) { + Some(v) + } else { + None + } } } @@ -216,6 +170,8 @@ fn 
main() -> anyhow::Result<()> { std::process::exit(1); })); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. @@ -247,8 +203,47 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + + let strict_mode = if args.dev { + StrictMode::Dev + } else { + StrictMode::Strict + }; + let secrets = Secrets::load(&args).await?; + // Validate required secrets and arguments are provided in strict mode + match strict_mode { + StrictMode::Strict + if (secrets.public_key.is_none() + || secrets.jwt_token.is_none() + || secrets.control_plane_jwt_token.is_none()) => + { + // Production systems should always have secrets configured: if public_key was not set + // then we would implicitly disable auth. + anyhow::bail!( + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); + } + StrictMode::Strict if args.compute_hook_url.is_none() => { + // Production systems should always have a compute hook set, to prevent falling + // back to trying to use neon_local. 
+ anyhow::bail!( + "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + ); + } + StrictMode::Strict => { + tracing::info!("Starting in strict mode: configuration is OK.") + } + StrictMode::Dev => { + tracing::warn!("Starting in dev mode: this may be an insecure configuration.") + } + } + let config = Config { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, @@ -257,9 +252,14 @@ async fn async_main() -> anyhow::Result<()> { .max_unavailable_interval .map(humantime::Duration::into) .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; @@ -274,7 +274,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/control_plane/attachment_service/src/metrics.rs b/storage_controller/src/metrics.rs similarity index 63% rename from control_plane/attachment_service/src/metrics.rs rename to storage_controller/src/metrics.rs index ccf5e9b07c..ac9f22c739 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! 
use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,21 +24,28 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, + /// Count of how many times we make an optimization change to a tenant's scheduling + pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -52,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -63,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -71,75 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: 
measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -153,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -162,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> 
Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -209,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -217,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -242,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -264,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/control_plane/attachment_service/src/node.rs b/storage_controller/src/node.rs similarity index 93% rename from control_plane/attachment_service/src/node.rs 
rename to storage_controller/src/node.rs index df40bff66f..7b5513c908 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,13 +1,14 @@ use std::{str::FromStr, time::Duration}; -use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, + TenantLocateResponseShard, }, shard::TenantShardId, }; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; @@ -256,6 +257,19 @@ impl Node { ) .await } + + /// Generate the simplified API-friendly description of a node's state + pub(crate) fn describe(&self) -> NodeDescribeResponse { + NodeDescribeResponse { + id: self.id, + availability: self.availability.into(), + scheduling: self.scheduling, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } } impl std::fmt::Display for Node { diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs similarity index 83% rename from control_plane/attachment_service/src/pageserver_client.rs rename to storage_controller/src/pageserver_client.rs index 8237229d7b..25b6b67e12 100644 --- a/control_plane/attachment_service/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,13 +1,14 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, }; use 
pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; -use utils::id::{NodeId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -88,6 +89,18 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantShardId, @@ -101,6 +114,27 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + pub(crate) async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/control_plane/attachment_service/src/persistence.rs b/storage_controller/src/persistence.rs similarity index 89% rename from control_plane/attachment_service/src/persistence.rs rename to storage_controller/src/persistence.rs index dafd52017b..dca37166ba 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -9,6 +10,7 @@ use camino::Utf8PathBuf; use 
diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::ShardConfigError; @@ -78,7 +80,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -107,6 +109,12 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; +/// Some methods can operate on either a whole tenant or a single shard +pub(crate) enum TenantFilter { + Tenant(TenantId), + Shard(TenantShardId), +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -137,18 +145,41 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... 
({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; @@ -168,7 +199,7 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let mut conn = self.connection_pool.get()?; @@ -275,6 +306,11 @@ impl Persistence { // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 shard.placement_policy = "{\"Attached\":0}".to_string(); } + + if shard.scheduling_policy.is_empty() { + shard.scheduling_policy = + serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); + } } let tenants: Vec = decoded.tenants.into_values().collect(); @@ -465,59 +501,45 @@ impl Persistence { /// that we only do the first time a tenant is set to an attached policy via /location_config. 
pub(crate) async fn update_tenant_shard( &self, - tenant_shard_id: TenantShardId, - input_placement_policy: PlacementPolicy, - input_config: TenantConfig, + tenant: TenantFilter, + input_placement_policy: Option, + input_config: Option, input_generation: Option, + input_scheduling_policy: Option, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - if let Some(input_generation) = input_generation { - // Update includes generation column - query - .set(( - generation.eq(Some(input_generation.into().unwrap() as i32)), - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; - } else { - // Update does not include generation column - query - .set(( - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, } - Ok(()) - }) - .await?; + let update = ShardUpdate { + 
generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + }; - Ok(()) - } - - pub(crate) async fn update_tenant_config( - &self, - input_tenant_id: TenantId, - input_config: TenantConfig, - ) -> DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; - - self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { - diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) - .execute(conn)?; + query.set(update).execute(conn)?; Ok(()) }) @@ -698,7 +720,7 @@ impl Persistence { } } -/// Parts of [`crate::tenant_state::TenantState`] that are stored durably +/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { @@ -728,6 +750,8 @@ pub(crate) struct TenantShardPersistence { pub(crate) splitting: SplitState, #[serde(default)] pub(crate) config: String, + #[serde(default)] + pub(crate) scheduling_policy: String, } impl TenantShardPersistence { diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs similarity index 100% rename from control_plane/attachment_service/src/persistence/split_state.rs rename to storage_controller/src/persistence/split_state.rs diff --git a/control_plane/attachment_service/src/reconciler.rs b/storage_controller/src/reconciler.rs similarity index 96% rename from control_plane/attachment_service/src/reconciler.rs rename to storage_controller/src/reconciler.rs index a62357f9ac..fe97f724c1 100644 --- 
a/control_plane/attachment_service/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,12 +1,12 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; -use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; -use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; const DEFAULT_HEATMAP_PERIOD: &str = "60s"; /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. pub(super) struct Reconciler { - /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -48,11 +48,15 @@ pub(super) struct Reconciler { /// To avoid stalling if the cloud control plane is unavailable, we may proceed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed - /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. 
pub(crate) compute_notify_failure: bool, + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. + pub(crate) _resource_units: ReconcileUnits, + /// A means to abort background reconciliation: it is essential to - /// call this when something changes in the original TenantState that + /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for /// example when a pageserver node goes offline, or the PlacementPolicy for /// the tenant is changed. @@ -66,7 +70,20 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } -/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + +/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. #[derive(Debug)] @@ -487,6 +504,7 @@ impl Reconciler { while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" @@ -749,7 +767,10 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. 
- tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; diff --git a/control_plane/attachment_service/src/scheduler.rs b/storage_controller/src/scheduler.rs similarity index 66% rename from control_plane/attachment_service/src/scheduler.rs rename to storage_controller/src/scheduler.rs index 981ba26cce..3ff0d87988 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_state::TenantState}; +use crate::{node::Node, tenant_shard::TenantShard}; use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; @@ -27,7 +27,7 @@ pub enum MaySchedule { #[derive(Serialize)] struct SchedulerNode { - /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived @@ -58,6 +58,86 @@ pub(crate) struct Scheduler { nodes: HashMap, } +/// Score for soft constraint scheduling: lower scores are preferred to higher scores. +/// +/// For example, we may set an affinity score based on the number of shards from the same +/// tenant already on a node, to implicitly prefer to balance out shards. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct AffinityScore(pub(crate) usize); + +impl AffinityScore { + /// If we have no anti-affinity at all toward a node, this is its score. 
It means + /// the scheduler has a free choice amongst nodes with this score, and may pick a node + /// based on other information such as total utilization. + pub(crate) const FREE: Self = Self(0); + + pub(crate) fn inc(&mut self) { + self.0 += 1; + } +} + +impl std::ops::Add for AffinityScore { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + +// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling +// it for many shards in the same tenant. +#[derive(Debug, Default)] +pub(crate) struct ScheduleContext { + /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] + pub(crate) nodes: HashMap, + + /// Specifically how many _attached_ locations are on each node + pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, +} + +impl ScheduleContext { + /// Input is a list of nodes we would like to avoid using again within this context. The more + /// times a node is passed into this call, the less inclined we are to use it. 
+ pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { + for node_id in nodes { + let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE); + entry.inc() + } + } + + pub(crate) fn push_attached(&mut self, node_id: NodeId) { + let entry = self.attached_nodes.entry(node_id).or_default(); + *entry += 1; + } + + pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { + self.nodes + .get(&node_id) + .copied() + .unwrap_or(AffinityScore::FREE) + } + + pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { + self.attached_nodes.get(&node_id).copied().unwrap_or(0) + } +} + impl Scheduler { pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { let mut scheduler_nodes = HashMap::new(); @@ -83,7 +163,7 @@ impl Scheduler { pub(crate) fn consistency_check<'a>( &self, nodes: impl Iterator, - shards: impl Iterator, + shards: impl Iterator, ) -> anyhow::Result<()> { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { @@ -224,53 +304,87 @@ impl Scheduler { node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) } - pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { + /// hard_exclude: it is forbidden to use nodes in this list, typically because they + /// are already in use by this shard -- we use this to avoid picking the same node + /// as both attached and secondary location. This is a hard constraint: if we cannot + /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`]. + /// + /// context: we prefer to avoid using nodes identified in the context, according + /// to their anti-affinity score. We use this to prefer to avoid placing shards in + /// the same tenant on the same node. This is a soft constraint: the context will never + /// cause us to fail to schedule a shard.
+ pub(crate) fn schedule_shard( + &self, + hard_exclude: &[NodeId], + context: &ScheduleContext, + ) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } - let mut tenant_counts: Vec<(NodeId, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, usize)> = self .nodes .iter() .filter_map(|(k, v)| { if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { - Some((*k, v.shard_count)) + Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + v.shard_count, + )) } }) .collect(); - // Sort by tenant count. Nodes with the same tenant count are sorted by ID. - tenant_counts.sort_by_key(|i| (i.1, i.0)); + // Sort by, in order of precedence: + // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available + // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. + // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. + scores.sort_by_key(|i| (i.1, i.2, i.0)); - if tenant_counts.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + if scores.is_empty() { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. 
tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } - let node_id = tenant_counts.first().unwrap().0; - tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", - tenant_counts.iter().map(|i| i.0 .0).collect::>() + // Lowest score wins + let node_id = scores.first().unwrap().0; + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", + scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. 
Ok(node_id) } + + /// Unit test access to internal state + #[cfg(test)] + pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().shard_count + } } #[cfg(test)] @@ -307,7 +421,7 @@ pub(crate) mod test_utils { mod tests { use super::*; - use crate::tenant_state::IntentState; + use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { let nodes = test_utils::make_test_nodes(2); @@ -316,15 +430,17 @@ mod tests { let mut t1_intent = IntentState::new(); let mut t2_intent = IntentState::new(); - let scheduled = scheduler.schedule_shard(&[])?; + let context = ScheduleContext::default(); + + let scheduled = scheduler.schedule_shard(&[], &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard(&[])?; + let scheduled = scheduler.schedule_shard(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); - let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?; + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); diff --git a/control_plane/attachment_service/src/schema.rs b/storage_controller/src/schema.rs similarity index 95% rename from control_plane/attachment_service/src/schema.rs rename to storage_controller/src/schema.rs index 76e4e56a66..ff37d0fe77 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -22,6 +22,7 @@ diesel::table! 
{ placement_policy -> Varchar, splitting -> Int2, config -> Text, + scheduling_policy -> Varchar, } } diff --git a/control_plane/attachment_service/src/service.rs b/storage_controller/src/service.rs similarity index 75% rename from control_plane/attachment_service/src/service.rs rename to storage_controller/src/service.rs index aa930014b2..ae7e8d3d7d 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,7 +8,14 @@ use std::{ }; use crate::{ - id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, + compute_hook::NotifyError, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, + persistence::{AbortShardSplitStatus, TenantFilter}, + reconciler::{ReconcileError, ReconcileUnits}, + scheduler::{ScheduleContext, ScheduleMode}, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + }, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -16,16 +23,19 @@ use control_plane::storage_controller::{ }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; -use hyper::StatusCode; +use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest}, }; +use reqwest::StatusCode; +use tracing::instrument; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -43,30 +53,28 
@@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::OwnedRwLockWriteGuard; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use tracing::instrument; use utils::{ completion::Barrier, + failpoint_support, generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, - seqwait::SeqWait, sync::gate::Gate, }; use crate::{ - compute_hook::{self, ComputeHook}, + compute_hook::ComputeHook, heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, - tenant_state::{ + tenant_shard::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantState, + ReconcilerWaiter, TenantShard, }, - Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -74,7 +82,7 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // For operations that might be slow, like migrating a tenant with // some data in it. -const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); // If we receive a call using Secondary mode initially, it will omit generation. We will initialize // tenant shards into this generation, and as long as it remains in this generation, we will accept @@ -85,27 +93,99 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +/// How long a node may be unresponsive to heartbeats before we declare it offline. 
+/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); + +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { - tenants: BTreeMap, + tenants: BTreeMap, nodes: Arc>, scheduler: Scheduler, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, +} + +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. 
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. 
+ ApiError::Conflict(format!("{node} {status}: {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } } impl ServiceState { fn new( nodes: HashMap, - tenants: BTreeMap, + tenants: BTreeMap, scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { Self { tenants, nodes: Arc::new(nodes), scheduler, + delayed_reconcile_rx, } } @@ -113,7 +193,7 @@ impl ServiceState { &mut self, ) -> ( &mut Arc>, - &mut BTreeMap, + &mut BTreeMap, &mut Scheduler, ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) @@ -139,6 +219,9 @@ pub struct Config { /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, } impl From for ApiError { @@ -171,11 +254,22 @@ pub struct Service { // Locking on a tenant granularity (covers all shards in the tenant): // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) - tenant_op_locks: IdLockMap, + tenant_op_locks: IdLockMap, // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or // that transition it to/from Active. - node_op_locks: IdLockMap, + node_op_locks: IdLockMap, + + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. 
This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. + delayed_reconcile_tx: tokio::sync::mpsc::Sender, // Process shutdown will fire this token cancel: CancellationToken, @@ -236,7 +330,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, + _tenant_lock: WrappedWriteGuard, } #[derive(thiserror::Error, Debug)] @@ -266,7 +360,12 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. #[instrument(skip_all)] - async fn startup_reconcile(self: &Arc) { + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed: HashMap)>> = @@ -285,10 +384,6 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let compute_notify_deadline = start_at - .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) - .expect("Reconcile timeout is a modest constant"); - // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); @@ -314,6 +409,7 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -332,11 +428,11 @@ impl Service { for (tenant_shard_id, 
shard_observations) in observed { for (node_id, observed_loc) in shard_observations { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node_id, ObservedStateLocation { conf: observed_loc }); @@ -344,9 +440,15 @@ impl Service { } // Populate each tenant's intent state - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(scheduler); - if let Err(e) = tenant_state.schedule(scheduler) { + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { + if tenant_shard_id.shard_number == ShardNumber(0) { + // Reset scheduling context each time we advance to the next Tenant + schedule_context = ScheduleContext::default(); + } + + tenant_shard.intent_from_observed(scheduler); + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -355,11 +457,11 @@ impl Service { // If we're both intending and observed to be attached at a particular node, we will // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. - if let Some(attached_at) = tenant_state.stably_attached() { + if let Some(attached_at) = tenant_shard.stably_attached() { compute_notifications.push(( *tenant_shard_id, attached_at, - tenant_state.shard.stripe_size, + tenant_shard.shard.stripe_size, )); } } @@ -374,28 +476,27 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. 
// - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - let notify_failures = self - .compute_notify_many(compute_notifications, compute_notify_deadline) - .await; - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_failures.into_iter() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; - } - } - } + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. 
+ tracing::info!("Checking for shards in need of reconciliation..."); let reconcile_tasks = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. @@ -436,6 +537,7 @@ impl Service { } } + tracing::info!("Sending initial heartbeats..."); let res = self .heartbeater .heartbeat(Arc::new(nodes_to_heartbeat)) @@ -472,6 +574,7 @@ impl Service { let mut node_list_futs = FuturesUnordered::new(); + tracing::info!("Scanning shards on {} nodes...", nodes.len()); for node in nodes.values() { node_list_futs.push({ async move { @@ -591,72 +694,6 @@ impl Service { } } - /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. - /// - /// Returns a set of any shards for which notifications where not acked within the deadline. - async fn compute_notify_many( - &self, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, - deadline: Instant, - ) -> HashSet { - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); - let mut success_shards = HashSet::new(); - - // Construct an async stream of futures to invoke the compute notify function: we do this - // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. - let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = self.compute_hook.clone(); - let cancel = self.cancel.clone(); - async move { - if let Err(e) = compute_hook - .notify(tenant_shard_id, node_id, stripe_size, &cancel) - .await - { - tracing::error!( - %tenant_shard_id, - %node_id, - "Failed to notify compute on startup for shard: {e}" - ); - None - } else { - Some(tenant_shard_id) - } - } - }) - .buffered(compute_hook::API_CONCURRENCY); - - loop { - tokio::select! 
{ - next = stream.next() => { - match next { - Some(Some(success_shard)) => { - // A notification succeeded - success_shards.insert(success_shard); - }, - Some(None) => { - // A notification that failed - }, - None => { - tracing::info!("Successfully sent all compute notifications"); - break; - } - } - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - // Give up sending any that didn't succeed yet - tracing::info!("Reached deadline while sending compute notifications"); - break; - } - }; - } - - attempt_shards - .difference(&success_shards) - .cloned() - .collect() - } - /// Long running background task that periodically wakes up and looks for shards that need /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible @@ -670,7 +707,13 @@ impl Service { let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); while !self.cancel.is_cancelled() { tokio::select! { - _ = interval.tick() => { self.reconcile_all(); } + _ = interval.tick() => { + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Run optimizer only when we didn't find any other work to do + self.optimize_all().await; + } + } _ = self.cancel.cancelled() => return } } @@ -727,8 +770,9 @@ impl Service { } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation - /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantState::maybe_reconcile`] will do nothing. + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. 
#[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -746,10 +790,10 @@ impl Service { tenant.generation = std::cmp::max(tenant.generation, result.generation); // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. tenant.pending_compute_notification = result.pending_compute_notification; - // Let the TenantState know it is idle. + // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); match result.result { @@ -781,36 +825,72 @@ impl Service { // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); + tenant.set_last_error(result.sequence, e); for (node_id, o) in result.observed.locations { tenant.observed.locations.insert(node_id, o); } } } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } } async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, ) { loop { // Wait for the next result, or for cancellation - let result = tokio::select! { + tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {result}, + Some(result) => {self.process_result(result);}, None => {break;} } } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, _ = self.cancel.cancelled() => { break; } }; - - self.process_result(result); } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -957,36 +1037,27 @@ impl Service { } for tsp in tenant_shard_persistence { let tenant_shard_id = tsp.get_tenant_shard_id()?; - let shard_identity = tsp.get_shard_identity()?; + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. 
let mut intent = IntentState::new(); if let Some(generation_pageserver) = tsp.generation_pageserver { intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } - - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - generation: tsp.generation.map(|g| Generation::new(g as u32)), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent, - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - splitting: tsp.splitting, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - pending_compute_notification: false, - }; + let new_tenant = TenantShard::from_persistent(tsp, intent)?; tenants.insert(tenant_shard_id, new_tenant); } let (startup_completion, startup_complete) = utils::completion::channel(); + // This channel is continuously consumed by process_results, so doesn't need to be very large. 
+ let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), @@ -995,13 +1066,20 @@ impl Service { ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - nodes, tenants, scheduler, + nodes, + tenants, + scheduler, + delayed_reconcile_rx, ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config)), + compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), cancel, @@ -1014,7 +1092,9 @@ impl Service { tokio::task::spawn(async move { // Block shutdown until we're done (we must respect self.cancel) if let Ok(_gate) = result_task_this.gate.enter() { - result_task_this.process_results(result_rx).await + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await } }); @@ -1056,7 +1136,7 @@ impl Service { return; }; - this.startup_reconcile().await; + this.startup_reconcile(bg_compute_notify_result_tx).await; drop(startup_completion); } }); @@ -1104,6 +1184,8 @@ impl Service { placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -1125,7 +1207,7 @@ impl Service { let mut locked = self.inner.write().unwrap(); locked.tenants.insert( attach_req.tenant_shard_id, - TenantState::new( + TenantShard::new( attach_req.tenant_shard_id, 
ShardIdentity::unsharded(), PlacementPolicy::Attached(0), @@ -1156,9 +1238,10 @@ impl Service { // when we reattaching a detached tenant. self.persistence .update_tenant_shard( - attach_req.tenant_shard_id, - PlacementPolicy::Attached(0), - conf, + TenantFilter::Shard(attach_req.tenant_shard_id), + Some(PlacementPolicy::Attached(0)), + Some(conf), + None, None, ) .await?; @@ -1176,32 +1259,32 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (_nodes, tenants, scheduler) = locked.parts_mut(); - let tenant_state = tenants + let tenant_shard = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Attached(0); + tenant_shard.generation = Some(new_generation); + tenant_shard.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. 
assert!(attach_req.node_id.is_none()); - tenant_state.policy = PlacementPolicy::Detached; + tenant_shard.policy = PlacementPolicy::Detached; } if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, ps_id = %attaching_pageserver, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.get_attached() { + } else if let Some(ps_id) = tenant_shard.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "dropping", ); } else { @@ -1209,14 +1292,14 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state + tenant_shard .intent .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", attach_req.tenant_shard_id, - tenant_state.generation, + tenant_shard.generation, // TODO: this is an odd number of 0xf's attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); @@ -1228,36 +1311,36 @@ impl Service { #[cfg(feature = "testing")] { if let Some(node_id) = attach_req.node_id { - tenant_state.observed.locations = HashMap::from([( + tenant_shard.observed.locations = HashMap::from([( node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation.unwrap(), - &tenant_state.shard, - &tenant_state.config, + tenant_shard.generation.unwrap(), + &tenant_shard.shard, + &tenant_shard.config, false, )), }, )]); } else { - tenant_state.observed.locations.clear(); + tenant_shard.observed.locations.clear(); } } Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), + .map(|_| tenant_shard.generation.expect("Test hook, not used on 
tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { let locked = self.inner.read().unwrap(); - let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id); + let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); InspectResponse { - attachment: tenant_state.and_then(|s| { + attachment: tenant_shard.and_then(|s| { s.intent .get_attached() .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) @@ -1280,7 +1363,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &OwnedRwLockWriteGuard<()>, + _lock: &WrappedWriteGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated @@ -1319,11 +1402,11 @@ impl Service { let mut locked = self.inner.write().unwrap(); for (tenant_shard_id, observed_loc) in configs.tenant_shards { - let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); @@ -1494,13 +1577,13 @@ impl Service { }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, req_tenant.gen, - tenant_state.generation + tenant_shard.generation ); 
response.tenants.push(ValidateResponseTenant { id: req_tenant.id, @@ -1523,15 +1606,23 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { - // Exclude any concurrent attempts to create/access the same tenant ID - let _tenant_lock = self - .tenant_op_locks - .exclusive(create_req.new_tenant_id.tenant_id) - .await; + let tenant_id = create_req.new_tenant_id.tenant_id; + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; let (response, waiters) = self.do_tenant_create(create_req).await?; - self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to + // accept compute notifications while it is in the process of creating. Reconciliation will + // be retried in the background. + tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})"); + } Ok(response) } @@ -1608,15 +1699,31 @@ impl Service { placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }) .collect(); - self.persistence + + match self + .persistence .insert_tenant_shards(persist_tenant_shards) .await - .map_err(|e| { - // TODO: distinguish primary key constraint (idempotent, OK), from other errors - ApiError::InternalServerError(anyhow::anyhow!(e)) - })?; + { + Ok(_) => {} + Err(DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ))) => { + // Unique key violation: this is probably a retry. 
Because the shard count is part of the unique key, + // if we see a unique key violation it means that the creation request's shard count matches the previous + // creation's shard count. + tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + } + // Any other database error is unexpected and a bug. + Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), + }; + + let mut schedule_context = ScheduleContext::default(); let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); @@ -1639,11 +1746,14 @@ impl Service { // attached and secondary locations (independently) away frorm those // pageservers also holding a shard for this tenant. - entry.get_mut().schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + entry + .get_mut() + .schedule(scheduler, &mut schedule_context) + .map_err(|e| { + ApiError::Conflict(format!( + "Failed to schedule shard {tenant_shard_id}: {e}" + )) + })?; if let Some(node_id) = entry.get().intent.get_attached() { let generation = entry @@ -1660,7 +1770,7 @@ impl Service { continue; } Entry::Vacant(entry) => { - let state = entry.insert(TenantState::new( + let state = entry.insert(TenantShard::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, @@ -1671,7 +1781,7 @@ impl Service { state.generation = initial_generation; state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler) { + if let Err(e) = state.schedule(scheduler, &mut schedule_context) { schcedule_error = Some(e); } @@ -1735,6 +1845,9 @@ impl Service { /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. 
fn tenant_location_config_prepare( &self, tenant_id: TenantId, @@ -1782,17 +1895,12 @@ impl Service { _ => None, }; - if shard.policy != placement_policy - || shard.config != req.config.tenant_conf - || set_generation.is_some() - { - updates.push(ShardUpdate { - tenant_shard_id: *shard_id, - placement_policy: placement_policy.clone(), - tenant_config: req.config.tenant_conf.clone(), - generation: set_generation, - }); - } + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); } if create { @@ -1821,6 +1929,7 @@ impl Service { }, ) } else { + assert!(!updates.is_empty()); TenantCreateOrUpdate::Update(updates) } } @@ -1844,10 +1953,12 @@ impl Service { req: TenantLocationConfigRequest, ) -> Result { // We require an exclusive lock, because we are updating both persistent and in-memory state - let _tenant_lock = self - .tenant_op_locks - .exclusive(tenant_shard_id.tenant_id) - .await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -1879,6 +1990,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. 
+ let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, placement_policy, @@ -1888,10 +2000,11 @@ impl Service { { self.persistence .update_tenant_shard( - *tenant_shard_id, - placement_policy.clone(), - tenant_config.clone(), + TenantFilter::Shard(*tenant_shard_id), + Some(placement_policy.clone()), + Some(tenant_config.clone()), *generation, + None, ) .await?; } @@ -1925,7 +2038,7 @@ impl Service { shard.generation = Some(generation); } - shard.schedule(scheduler)?; + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); if let Some(waiter) = maybe_waiter { @@ -1963,13 +2076,24 @@ impl Service { pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; let tenant_id = req.tenant_id; let config = req.config; self.persistence - .update_tenant_config(req.tenant_id, config.clone()) + .update_tenant_shard( + TenantFilter::Tenant(req.tenant_id), + None, + Some(config.clone()), + None, + None, + ) .await?; let waiters = { @@ -2046,7 +2170,12 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; let node = { let locked = self.inner.read().unwrap(); @@ -2079,7 +2208,7 @@ impl Service { let scheduler = &locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = 
scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; let node = locked .nodes .get(&node_id) @@ -2137,7 +2266,12 @@ impl Service { tenant_id: TenantId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress), ApiError> { - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { @@ -2231,7 +2365,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; self.ensure_attached_wait(tenant_id).await?; @@ -2322,6 +2457,65 @@ impl Service { Ok(StatusCode::NOT_FOUND) } + /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig" + /// for a tenant. 
The TenantConfig is passed through to pageservers, whereas this function modifies + /// the tenant's policies (configuration) within the storage controller + pub(crate) async fn tenant_update_policy( + &self, + tenant_id: TenantId, + req: TenantPolicyRequest, + ) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); + + let TenantPolicyRequest { + placement, + scheduling, + } = req; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(tenant_id), + placement.clone(), + None, + None, + scheduling, + ) + .await?; + + let mut schedule_context = ScheduleContext::default(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + if let Some(placement) = &placement { + shard.policy = placement.clone(); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated placement policy to {placement:?}"); + } + + if let Some(scheduling) = &scheduling { + shard.set_scheduling_policy(*scheduling); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated scheduling policy to {scheduling:?}"); + } + + // In case scheduling is being switched back on, try it now. 
+ shard.schedule(scheduler, &mut schedule_context).ok(); + self.maybe_reconcile_shard(shard, nodes); + } + + Ok(()) + } + pub(crate) async fn tenant_timeline_create( &self, tenant_id: TenantId, @@ -2333,7 +2527,12 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2380,17 +2579,7 @@ impl Service { client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| match e { - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - }) + .map_err(|e| passthrough_api_error(&node, e)) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then @@ -2452,13 +2641,57 @@ impl Service { Ok(results) } + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = 
futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2648,45 +2881,78 @@ impl Service { }) } + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( + &self, + shards: impl Iterator, + ) -> Option { + let mut shard_zero = None; + let mut describe_shards = Vec::new(); + + for shard in shards { + if shard.tenant_shard_id.is_shard_zero() { + shard_zero = Some(shard); + } + + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + scheduling_policy: *shard.get_scheduling_policy(), + }) + } + + let shard_zero = shard_zero?; + + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + pub(crate) fn tenant_describe( &self, tenant_id: TenantId, ) -> Result { let locked = self.inner.read().unwrap(); - let mut shard_zero = None; - let mut shards = Vec::new(); + self.tenant_describe_impl( + locked + 
.tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } - for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) { - if tenant_shard_id.is_zero() { - shard_zero = Some(shard); - } - - let response_shard = TenantDescribeResponseShard { - tenant_shard_id: *tenant_shard_id, - node_attached: *shard.intent.get_attached(), - node_secondary: shard.intent.get_secondary().to_vec(), - last_error: shard.last_error.lock().unwrap().clone(), - is_reconciling: shard.reconciler.is_some(), - is_pending_compute_notification: shard.pending_compute_notification, - is_splitting: matches!(shard.splitting, SplitState::Splitting), - }; - shards.push(response_shard); + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); } - let Some(shard_zero) = shard_zero else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); - }; - - Ok(TenantDescribeResponse { - shards, - stripe_size: shard_zero.shard.stripe_size, - policy: shard_zero.policy.clone(), - config: shard_zero.config.clone(), - }) + result } #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] @@ -2779,7 +3045,7 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; - if let Err(e) = shard.schedule(scheduler) { + if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or // capacity issues), that must not prevent us rolling back a split. 
In this // case it should be eventually scheduled in the background. @@ -2863,11 +3129,14 @@ impl Service { ) -> ( TenantShardSplitResponse, Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, ) { let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + { let mut locked = self.inner.write().unwrap(); @@ -2903,6 +3172,7 @@ impl Service { ) }; + let mut schedule_context = ScheduleContext::default(); for child in child_ids { let mut child_shard = parent_ident; child_shard.number = child.shard_number; @@ -2924,7 +3194,7 @@ impl Service { }, ); - let mut child_state = TenantState::new(child, child_shard, policy.clone()); + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, @@ -2932,27 +3202,125 @@ impl Service { child_state.generation = Some(generation); child_state.config = config.clone(); - // The child's TenantState::splitting is intentionally left at the default value of Idle, + // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. child_locations.push((child, pageserver, child_shard.stripe_size)); - if let Err(e) = child_state.schedule(scheduler) { + if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { // This is not fatal, because we've implicitly already got an attached // location for the child shard. Failure here just means we couldn't // find a secondary (e.g. because cluster is overloaded). 
tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - self.maybe_reconcile_shard(&mut child_state, nodes); + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } tenants.insert(child, child_state); response.new_shards.push(child); } } + (response, child_locations, waiters) + } + } - (response, child_locations) + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their secondaries couldn't be attached. + tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. 
+ continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + } } } @@ -2963,7 +3331,12 @@ impl Service { ) -> Result { // TODO: return 503 if we get stuck waiting for this lock // (issue https://github.com/neondatabase/neon/issues/7108) - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -2981,8 +3354,8 @@ impl Service { .do_tenant_shard_split(tenant_id, shard_split_params) .await; - match r { - Ok(r) => Ok(r), + let (response, waiters) = match r { + Ok(r) => r, Err(e) => { // Split might be part-done, we must do work to abort it. 
tracing::warn!("Enqueuing background abort of split on {tenant_id}"); @@ -2995,9 +3368,17 @@ impl Service { }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); - Err(e) + return Err(e); } - } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) } fn prepare_tenant_shard_split( @@ -3147,7 +3528,7 @@ impl Service { &self, tenant_id: TenantId, params: ShardSplitParams, - ) -> Result { + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the @@ -3231,6 +3612,10 @@ impl Service { placement_policy: serde_json::to_string(&policy).unwrap(), config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, + + // Scheduling policies do not carry through to children + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }); } @@ -3345,7 +3730,7 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. 
- let (response, child_locations) = + let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards @@ -3372,7 +3757,7 @@ impl Service { } } - Ok(response) + Ok((response, waiters)) } pub(crate) async fn tenant_shard_migrate( @@ -3477,8 +3862,90 @@ impl Service { Ok(()) } - /// For debug/support: a full JSON dump of TenantStates. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3582,7 +4049,7 @@ impl Service { } /// For debug/support: a JSON dump of the [`Scheduler`]. 
Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3638,9 +4105,13 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; - // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { @@ -3727,7 +4198,8 @@ impl Service { availability: Option, scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(node_id).await; + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before @@ -3798,8 +4270,9 @@ impl Service { AvailabilityTransition::ToOffline => { tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; @@ -3812,18 +4285,24 @@ impl Service { continue; } - if tenant_state.intent.demote_attached(node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match 
tenant_state.schedule(scheduler) { + if tenant_shard.intent.demote_attached(node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); + + // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters + // for tenants without secondary locations: if they have a secondary location, then this + // schedule() call is just promoting an existing secondary) + let mut schedule_context = ScheduleContext::default(); + + match tenant_shard.schedule(scheduler, &mut schedule_context) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. + // TODO: give TenantShard a scheduling error attribute to be queried later. tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self - .maybe_reconcile_shard(tenant_state, &new_nodes) + .maybe_reconcile_shard(tenant_shard, &new_nodes) .is_some() { tenants_affected += 1; @@ -3842,10 +4321,10 @@ impl Service { tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. 
- for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for tenant_shard in locked.tenants.values_mut() { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_state, &new_nodes); + self.maybe_reconcile_shard(tenant_shard, &new_nodes); } } } @@ -3853,7 +4332,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", node_id); + tracing::debug!("Node {} no change during config", node_id); } } @@ -3865,9 +4344,6 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. - /// - /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is - /// an attached policy. We should error out if it isn't. fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, @@ -3876,10 +4352,27 @@ impl Service { let mut waiters = Vec::new(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler)?; + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.schedule(scheduler, &mut schedule_context)?; + + // The shard's policies may not result in an attached location being scheduled: this + // is an error because our caller needs it attached somewhere. 
+ if shard.intent.get_attached().is_none() { + return Err(anyhow::anyhow!( + "Tenant {tenant_id} not scheduled to be attached" + )); + }; + + if shard.stably_attached().is_some() { + // We do not require the shard to be totally up to date on reconciliation: we just require + // that it has been attached on the intended node. Other dirty state such as unattached secondary + // locations, or compute hook notifications can be ignored. + continue; + } if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); waiters.push(waiter); } } @@ -3914,20 +4407,64 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides - /// all the references to parts of Self that are needed + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], fn maybe_reconcile_shard( &self, - shard: &mut TenantState, + shard: &mut TenantShard, nodes: &Arc>, ) -> Option { - shard.maybe_reconcile( + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. 
+ tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. + return None; + }; + + shard.spawn_reconciler( &self.result_tx, nodes, &self.compute_hook, &self.config, &self.persistence, - &self.gate, + units, + gate_guard, &self.cancel, ) } @@ -3935,25 +4472,350 @@ impl Service { /// Check all tenants for pending reconciliation work, and reconcile those in need. /// Additionally, reschedule tenants that require it. /// - /// Returns how many reconciliation tasks were started + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. 
fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + let mut schedule_context = ScheduleContext::default(); + let mut reconciles_spawned = 0; - for (_tenant_shard_id, shard) in tenants.iter_mut() { + for (tenant_shard_id, shard) in tenants.iter_mut() { + if tenant_shard_id.is_shard_zero() { + schedule_context = ScheduleContext::default(); + } + + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. + reconciles_spawned = std::cmp::max(1, reconciles_spawned); + continue; + } + + // Eventual consistency: if an earlier reconcile job failed, and the shard is still + // dirty, spawn another one if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; } + + schedule_context.avoid(&shard.intent.all_pageservers()); } reconciles_spawned } + /// `optimize` in this context means identifying shards which have valid scheduled locations, but + /// could be scheduled somewhere better: + /// - Cutting over to a secondary if the node with the secondary is more lightly loaded + /// * e.g. after a node fails then recovers, to move some work back to it + /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant + /// * e.g. after a shard split, the initial attached locations will all be on the node where + /// we did the split, but are probably better placed elsewhere. + /// - Creating new secondary locations if it improves the spreading of a sharded tenant + /// * e.g. after a shard split, some locations will be on the same node (where the split + /// happened), and will probably be better placed elsewhere. 
+ /// + /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at + /// the time of scheduling, this function looks for cases where a better-scoring location is available + /// according to those same soft constraints. + async fn optimize_all(&self) -> usize { + // Limit on how many shards' optimizations each call to this function will execute. Combined + // with the frequency of background calls, this acts as an implicit rate limit that runs a small + // trickle of optimizations in the background, rather than executing a large number in parallel + // when a change occurs. + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a 
truly quiet state, it's going to do some work as soon + // as these validations start passing. + reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readiness: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + + let mut work = Vec::new(); + + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, shard) in tenants.iter() { + if tenant_shard_id.is_shard_zero() { + // Reset accumulators on the first shard in a tenant + schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; + tenant_shards.clear(); + } + + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; + } + + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } + } + + // Accumulate the schedule context for all the shards in a tenant: we must have + // the total view of all shards before we can try to optimize any of them. 
+ schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + // Once we have seen the last shard in the tenant, proceed to search across all shards + // in the tenant for optimizations + if shard.shard.number.0 == shard.shard.count.count() - 1 { + if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + // Do not start any optimizations while another change to the tenant is ongoing: this + // is not necessary for correctness, but simplifies operations and implicitly throttles + // optimization changes to happen in a "trickle" over time. + continue; + } + + if tenant_shards.iter().any(|s| { + !matches!(s.splitting, SplitState::Idle) + || matches!(s.policy, PlacementPolicy::Detached) + }) { + // Never attempt to optimize a tenant that is currently being split, or + // a tenant that is meant to be detached + continue; + } + + // TODO: optimization calculations are relatively expensive: create some fast-path for + // the common idle case (avoiding the search on tenants that we have recently checked) + + for shard in &tenant_shards { + if let Some(optimization) = + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. 
+ shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } + + // TODO: extend this mechanism to prefer attaching on nodes with fewer attached + // tenants (i.e. extend schedule state to distinguish attached from secondary counts), + // for the total number of attachments on a node (not just within a tenant.) + } + } + } + + work + } + + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. 
+ want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. + let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.heatmap_mtime.is_none() + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: 
{progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } + } + } + + validated_work + } + + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but + /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should + /// put the system into a quiescent state where future background reconciliations won't do anything. + pub(crate) async fn reconcile_all_now(&self) -> Result { + let reconciles_spawned = self.reconcile_all(); + let reconciles_spawned = if reconciles_spawned == 0 { + // Only optimize when we are otherwise idle + self.optimize_all().await + } else { + reconciles_spawned + }; + + let waiters = { + let mut waiters = Vec::new(); + let locked = self.inner.read().unwrap(); + for (_tenant_shard_id, shard) in locked.tenants.iter() { + if let Some(waiter) = shard.get_waiter() { + waiters.push(waiter); + } + } + waiters + }; + + let waiter_count = waiters.len(); + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + + Ok(std::cmp::max(waiter_count, reconciles_spawned)) + } + pub async fn shutdown(&self) { // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantState`] objects will reach a neat + // we do not expect that our [`TenantShard`] objects will reach a neat // final state. 
self.cancel.cancel(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/storage_controller/src/tenant_shard.rs similarity index 53% rename from control_plane/attachment_service/src/tenant_state.rs rename to storage_controller/src/tenant_shard.rs index 83c921dc58..dda17f9887 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,8 +7,10 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + reconciler::ReconcileUnits, + scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; -use pageserver_api::controller_api::PlacementPolicy; +use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -21,7 +23,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, - sync::gate::Gate, + sync::gate::GateGuard, }; use crate::{ @@ -36,12 +38,18 @@ use crate::{ }; /// Serialization helper -fn read_mutex_content(v: &std::sync::Mutex, serializer: S) -> Result +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result where S: serde::ser::Serializer, - T: Clone + std::fmt::Display, + T: std::fmt::Display, { - serializer.collect_str(&v.lock().unwrap()) + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) } /// In-memory state for a particular tenant shard. @@ -49,7 +57,7 @@ where /// This struct implement Serialize for debugging purposes, but is _not_ persisted /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. 
#[derive(Serialize)] -pub(crate) struct TenantState { +pub(crate) struct TenantShard { pub(crate) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -94,6 +102,10 @@ pub(crate) struct TenantState { /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. + pub(crate) delayed_reconcile: bool, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] @@ -105,17 +117,25 @@ pub(crate) struct TenantState { #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, - /// The most recent error from a reconcile on this tenant + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcileWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? - #[serde(serialize_with = "read_mutex_content")] - pub(crate) last_error: std::sync::Arc>, + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, - /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry - /// sending it. This is the mechanism by which compute notifications are included in the scope + /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. 
pub(crate) pending_compute_notification: bool, + + // Support/debug tool: if something is going wrong or flapping with scheduling, this may + // be set to a non-active state to avoid making changes while the issue is fixed. + scheduling_policy: ShardSchedulingPolicy, } #[derive(Default, Clone, Debug, Serialize)] @@ -246,8 +266,13 @@ impl IntentState { impl Drop for IntentState { fn drop(&mut self) { - // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler - debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. + // We do not check this while panicking, to avoid polluting unit test failures or + // other assertions with this assertion's output. It's still wrong to leak these, + // but if we already have a panic then we don't need to independently flag this case. + if !(std::thread::panicking()) { + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } } } @@ -278,18 +303,48 @@ pub(crate) struct ReconcilerWaiter { seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, - error: std::sync::Arc>, + error: std::sync::Arc>>>, seq: Sequence, } #[derive(thiserror::Error, Debug)] -pub enum ReconcileWaitError { +pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] - Failed(TenantShardId, String), + Failed(TenantShardId, Arc), +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ReplaceSecondary { + old_node_id: NodeId, + new_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct MigrateAttachment { + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) enum ScheduleOptimizationAction { + // Replace one of our secondary locations with a different node + ReplaceSecondary(ReplaceSecondary), + // 
Migrate attachment to an existing secondary location + MigrateAttachment(MigrateAttachment), +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, } impl ReconcilerWaiter { @@ -307,7 +362,8 @@ impl ReconcilerWaiter { SeqWaitError::Timeout => unreachable!() })?; - return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone())) + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } @@ -323,8 +379,19 @@ pub(crate) struct ReconcilerHandle { cancel: CancellationToken, } +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + /// When a reconcile task completes, it sends this result object -/// to be applied to the primary TenantState. +/// to be applied to the primary TenantShard. 
pub(crate) struct ReconcileResult { pub(crate) sequence: Sequence, /// On errors, `observed` should be treated as an incompleted description @@ -337,7 +404,7 @@ pub(crate) struct ReconcileResult { pub(crate) generation: Option, pub(crate) observed: ObservedState, - /// Set [`TenantState::pending_compute_notification`] from this flag + /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, } @@ -349,7 +416,7 @@ impl ObservedState { } } -impl TenantState { +impl TenantShard { pub(crate) fn new( tenant_shard_id: TenantShardId, shard: ShardIdentity, @@ -366,10 +433,12 @@ impl TenantState { reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), + delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), pending_compute_notification: false, + scheduling_policy: ShardSchedulingPolicy::default(), } } @@ -425,6 +494,7 @@ impl TenantState { fn schedule_attached( &mut self, scheduler: &mut Scheduler, + context: &ScheduleContext, ) -> Result<(bool, NodeId), ScheduleError> { // No work to do if we already have an attached tenant if let Some(node_id) = self.intent.attached { @@ -438,14 +508,33 @@ impl TenantState { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = scheduler.schedule_shard(&self.intent.secondary)?; + let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) } } - pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { + pub(crate) fn schedule( + &mut self, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) -> Result<(), ScheduleError> { + let r = self.do_schedule(scheduler, context); + + 
context.avoid(&self.intent.all_pageservers()); + if let Some(attached) = self.intent.get_attached() { + context.push_attached(*attached); + } + + r + } + + pub(crate) fn do_schedule( + &mut self, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other // pageservers if so. @@ -453,6 +542,16 @@ impl TenantState { // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not // change their attach location. + match self.scheduling_policy { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {} + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // Warn to make it obvious why other things aren't happening/working, if we skip scheduling + tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling is disabled by policy {:?}", self.scheduling_policy); + return Ok(()); + } + } + // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. 
let mut modified = false; @@ -479,12 +578,13 @@ impl TenantState { } // Should have exactly one attached, and N secondaries - let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; + let (modified_attached, attached_node_id) = + self.schedule_attached(scheduler, context)?; modified |= modified_attached; let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler.schedule_shard(&used_pageservers)?; + let node_id = scheduler.schedule_shard(&used_pageservers, context)?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -497,7 +597,7 @@ impl TenantState { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], context)?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -524,6 +624,181 @@ impl TenantState { Ok(()) } + /// Optimize attachments: if a shard has a secondary location that is preferable to + /// its primary location based on soft constraints, switch that secondary location + /// to be attached. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_attachment( + &self, + nodes: &HashMap, + schedule_context: &ScheduleContext, + ) -> Option { + let attached = (*self.intent.get_attached())?; + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + let current_affinity_score = schedule_context.get_node_affinity(attached); + let current_attachment_count = schedule_context.get_node_attachments(attached); + + // Generate score for each node, dropping any un-schedulable nodes. 
+ let all_pageservers = self.intent.all_pageservers(); + let mut scores = all_pageservers + .iter() + .flat_map(|node_id| { + if matches!( + nodes + .get(node_id) + .map(|n| n.may_schedule()) + .unwrap_or(MaySchedule::No), + MaySchedule::No + ) { + None + } else { + let affinity_score = schedule_context.get_node_affinity(*node_id); + let attachment_count = schedule_context.get_node_attachments(*node_id); + Some((*node_id, affinity_score, attachment_count)) + } + }) + .collect::>(); + + // Sort precedence: + // 1st - prefer nodes with the lowest total affinity score + // 2nd - prefer nodes with the lowest number of attachments in this context + // 3rd - if all else is equal, sort by node ID for determinism in tests. + scores.sort_by_key(|i| (i.1, i.2, i.0)); + + if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = + scores.first() + { + if attached != *preferred_node { + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called (e.g. there's no point migrating from + // a location with score 1 to a score zero, because on next location the situation + // would be the same, but in reverse). 
+ if current_affinity_score > *preferred_affinity_score + AffinityScore(1) + || current_attachment_count > *preferred_attachment_count + 1 + { + tracing::info!( + "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); + } + } else { + tracing::debug!( + "Node {} is already preferred (score {:?})", + preferred_node, + preferred_affinity_score + ); + } + } + + // Fall-through: we didn't find an optimization + None + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_secondary( + &self, + scheduler: &Scheduler, + schedule_context: &ScheduleContext, + ) -> Option { + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + for secondary in self.intent.get_secondary() { + let Some(affinity_score) = schedule_context.nodes.get(secondary) else { + // We're already on a node unaffected any affinity constraints, + // so we won't change it. + continue; + }; + + // Let the scheduler suggest a node, where it would put us if we were scheduling afresh + // This implicitly limits the choice to nodes that are available, and prefers nodes + // with lower utilization. 
+ let Ok(candidate_node) = + scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context) + else { + // A scheduling error means we have no possible candidate replacements + continue; + }; + + let candidate_affinity_score = schedule_context + .nodes + .get(&candidate_node) + .unwrap_or(&AffinityScore::FREE); + + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called. + if *candidate_affinity_score + AffinityScore(1) < *affinity_score { + // If some other node is available and has a lower score than this node, then + // that other node is a good place to migrate to. + tracing::info!( + "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); + } + } + + None + } + + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's + pub(crate) fn apply_optimization( + &mut self, + scheduler: &mut Scheduler, + optimization: ScheduleOptimization, + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_optimization + .inc(); + + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.demote_attached(old_attached_node_id); + self.intent + .promote_attached(scheduler, new_attached_node_id); + } + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id, + new_node_id, + }) => { + self.intent.remove_secondary(scheduler, old_node_id); + self.intent.push_secondary(scheduler, new_node_id); + } + } 
+ + true + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -608,16 +883,10 @@ impl TenantState { #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) fn maybe_reconcile( + pub(crate) fn get_reconcile_needed( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - gate: &Gate, - cancel: &CancellationToken, - ) -> Option { + ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; @@ -639,8 +908,8 @@ impl TenantState { active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { - tracing::info!("Not dirty, no reconciliation needed."); - return None; + tracing::debug!("Not dirty, no reconciliation needed."); + return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic @@ -648,7 +917,7 @@ impl TenantState { // up top, so that we only log this message if we would otherwise have done a reconciliation. if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); - return None; + return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? 
@@ -658,7 +927,7 @@ impl TenantState { "Reconciliation already in progress for sequence {:?}", self.sequence, ); - return Some(ReconcilerWaiter { + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), @@ -668,6 +937,76 @@ impl TenantState { } } + // Pre-checks done: finally check whether we may actually do the work + match self.scheduling_policy { + ShardSchedulingPolicy::Active + | ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause => {} + ShardSchedulingPolicy::Stop => { + // We only reach this point if there is work to do and we're going to skip + // doing it: warn it obvious why this tenant isn't doing what it ought to. + tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); + return ReconcileNeeded::No; + } + } + + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. + /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. 
+ pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. + let old_handle = self.reconciler.take(); + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -683,18 +1022,9 @@ impl TenantState { } } - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let Ok(gate_guard) = gate.enter() else { - // Shutting down, don't start a reconciler - return None; - }; - // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
- self.sequence = self.sequence.next(); + self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); @@ -709,6 +1039,7 @@ impl TenantState { compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, + _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, @@ -775,16 +1106,18 @@ impl TenantState { status: outcome_label, }); - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. + let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); } .instrument(reconciler_span), ); @@ -804,6 +1137,22 @@ impl TenantState { }) } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation + /// if it is not already running + pub(crate) fn get_waiter(&self) -> Option { + if self.reconciler.is_some() { + Some(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }) + } else { + None + } + } + /// Called when a ReconcileResult has been emitted and the service is updating /// our state: if the result is from a sequence >= my ReconcileHandle, then drop /// the handle to indicate there is no longer a reconciliation 
in progress. @@ -829,6 +1178,48 @@ impl TenantState { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); } + pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { + self.scheduling_policy = p; + } + + pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { + &self.scheduling_policy + } + + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. + *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + + pub(crate) fn from_persistent( + tsp: TenantShardPersistence, + intent: IntentState, + ) -> anyhow::Result { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; + + Ok(Self { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: tsp.generation.map(|g| Generation::new(g as u32)), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + splitting: tsp.splitting, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + pending_compute_notification: false, + delayed_reconcile: false, + scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + }) + } + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { TenantShardPersistence { tenant_id: self.tenant_shard_id.tenant_id.to_string(), @@ -840,6 +1231,7 @@ impl TenantState { placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), } } } @@ -856,7 
+1248,7 @@ pub(crate) mod tests { use super::*; - fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); @@ -866,7 +1258,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -878,6 +1270,32 @@ pub(crate) mod tests { ) } + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + let tenant_id = TenantId::generate(); + + (0..shard_count.count()) + .map(|i| { + let shard_number = ShardNumber(i); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantShard::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy.clone(), + ) + }) + .collect() + } + /// Test the scheduling behaviors used when a tenant configured for HA is subject /// to nodes being marked offline. 
#[test] @@ -887,25 +1305,26 @@ pub(crate) mod tests { let mut nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state - .schedule(&mut scheduler) + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + tenant_shard + .schedule(&mut scheduler, &mut context) .expect("we have enough nodes, scheduling should work"); // Expect to initially be schedule on to different nodes - assert_eq!(tenant_state.intent.secondary.len(), 1); - assert!(tenant_state.intent.attached.is_some()); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); - let attached_node_id = tenant_state.intent.attached.unwrap(); - let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + let attached_node_id = tenant_shard.intent.attached.unwrap(); + let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.demote_attached(attached_node_id); + let changed = tenant_shard.intent.demote_attached(attached_node_id); assert!(changed); - assert!(tenant_state.intent.attached.is_none()); - assert_eq!(tenant_state.intent.secondary.len(), 2); + assert!(tenant_shard.intent.attached.is_none()); + assert_eq!(tenant_shard.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes @@ -915,18 +1334,18 @@ pub(crate) mod tests { scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached - tenant_state - .schedule(&mut scheduler) + tenant_shard + .schedule(&mut scheduler, &mut context) .expect("active nodes are available"); - 
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); // The original attached node should have been retained as a secondary assert_eq!( - *tenant_state.intent.secondary.iter().last().unwrap(), + *tenant_shard.intent.secondary.iter().last().unwrap(), attached_node_id ); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -936,48 +1355,272 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(3), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedMulti, generation: Some(2), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(2), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedStale, generation: Some(1), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.intent_from_observed(&mut scheduler); + 
tenant_shard.intent_from_observed(&mut scheduler); // The highest generationed attached location gets used as attached - assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); // Other locations get used as secondary - assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); - scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; + + tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn scheduling_mode() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // In pause mode, schedule() shouldn't do anything + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(tenant_shard.intent.all_pageservers().is_empty()); + + // In active mode, schedule() works + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(!tenant_shard.intent.all_pageservers().is_empty()); + + tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn optimize_attachment() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. 
+ shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); + + // Either shard should recognize that it has the option to switch to a secondary location where there + // would be no other shards from the same tenant, and request to do so. + assert_eq!( + optimization_a, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) + ); + + // Note that these optimizing two shards in the same tenant with the same ScheduleContext is + // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility + // of [`Service::optimize_all`] to avoid trying + // to do optimizations for multiple shards in the same tenant at the same time. 
Generating + // both optimizations is just done for test purposes + let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); + assert_eq!( + optimization_b, + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) + ); + + // Applying these optimizations should result in the end state proposed + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); + assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); + assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn optimize_secondary() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. 
+ shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context); + + // Since there is a node with no locations available, the node with two locations for the + // same tenant should generate an optimization to move one away + assert_eq!( + optimization_a, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) + ); + + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + // Optimize til quiescent: this emulates what Service::optimize_all does, when + // called repeatedly in the background. 
+ fn optimize_til_idle( + nodes: &HashMap, + scheduler: &mut Scheduler, + shards: &mut [TenantShard], + ) { + let mut loop_n = 0; + loop { + let mut schedule_context = ScheduleContext::default(); + let mut any_changed = false; + + for shard in shards.iter() { + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + } + + for shard in shards.iter_mut() { + let optimization = shard.optimize_attachment(nodes, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + + let optimization = shard.optimize_secondary(scheduler, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + } + + if !any_changed { + break; + } + + // Assert no infinite loop + loop_n += 1; + assert!(loop_n < 1000); + } + } + + /// Test the balancing behavior of shard scheduling: that it achieves a balance, and + /// that it converges. + #[test] + fn optimize_add_nodes() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + + // Only show the scheduler a couple of nodes + let mut scheduler = Scheduler::new([].iter()); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut schedule_context = ScheduleContext::default(); + for shard in &mut shards { + assert!(shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok()); + } + + // We should see equal number of locations on the two nodes. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + + // Add another two nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + optimize_til_idle(&nodes, &mut scheduler, &mut shards); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + + for shard in shards.iter_mut() { + shard.intent.clear(&mut scheduler); + } - tenant_state.intent.clear(&mut scheduler); Ok(()) } } diff --git a/test_runner/README.md b/test_runner/README.md index 96e74659ce..fd68cfff79 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -76,13 +76,10 @@ you can use `--pg-version` argument. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: -`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli - `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. @@ -95,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. 
+At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with equal name but different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. +That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option, + ... +} +``` + +*Credentials* are not part of the config, but discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanism here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. 
+Our CI uses this mechanism. +However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=... \ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `testbucket` using the CLI. 
+ +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensible to it. + +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. + +So, above instructions apply to the Rust test as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. 
A Neon Environment diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e7959c1764..c32748f6f0 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -482,20 +482,18 @@ def pytest_terminal_summary( terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 9dd66fe636..a883d94f73 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -14,10 +14,18 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads = {} + self.on_notify = None def register_workload(self, workload): self.workloads[workload.tenant_id] = workload + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. 
+ """ + self.on_notify = fn + @pytest.fixture(scope="function") def compute_reconfigure_listener(make_httpserver): @@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver): body: dict[str, Any] = request.json log.info(f"notify-attach request: {body}") + if self.on_notify is not None: + self.on_notify(body) + try: workload = self.workloads[TenantId(body["tenant_id"])] except KeyError: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f8994a8dcc..da379693a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump +from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -68,7 +68,7 @@ from fixtures.remote_storage import ( RemoteStorageUser, S3Storage, default_remote_storage, - remote_storage_to_toml_inline_table, + remote_storage_to_toml_dict, ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import 
are_walreceivers_absent @@ -82,6 +82,7 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) +from fixtures.utils import AuxFileStore as AuxFileStore # reexport """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -450,6 +451,7 @@ class NeonEnvBuilder: test_output_dir: Path, test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, + # toml that will be decomposed into `--config-override` flags during `pageserver --init` pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, num_pageservers: int = 1, @@ -464,6 +466,7 @@ class NeonEnvBuilder: initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -487,6 +490,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath + self.neon_local_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files @@ -499,6 +503,7 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -507,6 +512,18 @@ class NeonEnvBuilder: self.pageserver_get_vectored_impl = "vectored" log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + + 
self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + + self.pageserver_aux_file_policy = pageserver_aux_file_policy + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -520,9 +537,9 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self, register_pageservers=False): + def start(self): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start(register_pageservers=register_pageservers) + self.env.start() def init_start( self, @@ -552,6 +569,7 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, + aux_file_v2=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -620,17 +638,11 @@ class NeonEnvBuilder: def from_repo_dir( self, repo_dir: Path, - neon_binpath: Optional[Path] = None, - pg_distrib_dir: Optional[Path] = None, ) -> NeonEnv: """ A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. 
""" - # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests - self.neon_binpath = neon_binpath or self.neon_binpath - self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir - # Get the initial tenant and timeline from the snapshot config snapshot_config_toml = repo_dir / "config" with snapshot_config_toml.open("r") as f: @@ -689,6 +701,11 @@ class NeonEnvBuilder: config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + # FIXME: overriding pg_distrib_dir cause storage controller fail to start + # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) @@ -970,7 +987,7 @@ class NeonEnv: Some notable functions and fields in NeonEnv: - postgres - A factory object for creating postgres compute nodes. + endpoints - A factory object for creating postgres compute nodes. 
pageservers - An array containing objects representing the pageservers @@ -1005,12 +1022,13 @@ class NeonEnv: self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath - # Binary path for neon_local test-specific binaries: may be overridden - # after construction for compat testing - self.neon_local_binpath = config.neon_binpath + # Binary path for neon_local test-specific binaries + self.neon_local_binpath = config.neon_local_binpath + if self.neon_local_binpath is None: + self.neon_local_binpath = self.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 - self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -1039,15 +1057,16 @@ class NeonEnv: ) self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - # Create a config file corresponding to the options + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: @@ -1056,6 +1075,9 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1078,6 +1100,21 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = 
self.pageserver_virtual_file_io_engine if config.pageserver_get_vectored_impl is not None: ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1085,7 +1122,6 @@ class NeonEnv: self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -1113,24 +1149,19 @@ class NeonEnv: cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") - self.neon_cli.init(cfg, force=config.config_init_force) + self.neon_cli.init( + cfg, + force=config.config_init_force, + ) - def start(self, register_pageservers=False): - # storage controller starts first, so that pageserver /re-attach calls don't + def start(self): + # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start() - def storage_controller_ready(): - assert self.storage_controller.ready() is True - # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. - wait_until(30, 1, storage_controller_ready) - - if register_pageservers: - # Special case for forward compat tests, this can be removed later. 
- for pageserver in self.pageservers: - self.storage_controller.node_register(pageserver) + self.storage_controller.wait_until_ready() # Start up broker, pageserver and all safekeepers futs = [] @@ -1155,13 +1186,17 @@ class NeonEnv: After this method returns, there should be no child processes running. """ self.endpoints.stop_all() + + # Stop storage controller before pageservers: we don't want it to spuriously + # detect a pageserver "failure" during test teardown + self.storage_controller.stop(immediate=immediate) + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: pageserver.assert_no_metric_errors() pageserver.stop(immediate=immediate) - self.storage_controller.stop(immediate=immediate) self.broker.stop(immediate=immediate) @property @@ -1269,6 +1304,7 @@ def _shared_simple_env( pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1299,6 +1335,7 @@ def _shared_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: env = builder.init_start() @@ -1338,6 +1375,7 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1371,6 +1409,7 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: yield builder @@ -1530,6 +1569,7 @@ class NeonCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, + aux_file_v2: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1553,6 +1593,16 @@ class NeonCli(AbstractNeonCli): product(["-c"], (f"{key}:{value}" for key, value in conf.items())) ) ) + + if aux_file_v2 is AuxFileStore.V2: + args.extend(["-c", "switch_aux_file_policy:v2"]) + + if aux_file_v2 is AuxFileStore.V1: + args.extend(["-c", "switch_aux_file_policy:v1"]) + + if aux_file_v2 is AuxFileStore.CrossValidation: + args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + if set_default: args.append("--set-default") @@ -1569,6 +1619,11 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. 
@@ -1682,32 +1737,24 @@ class NeonCli(AbstractNeonCli): def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], force: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(toml.dumps(config)) - tmp.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() - cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] + cmd = [ + "init", + f"--config={init_config_tmpfile.name}", + ] if force is not None: cmd.extend(["--force", force]) - storage = self.env.pageserver_remote_storage - - append_pageserver_param_overrides( - params_to_update=cmd, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) - - s3_env_vars = None - if isinstance(storage, S3Storage): - s3_env_vars = storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() - return res + return res def storage_controller_start(self): cmd = ["storage_controller", "start"] @@ -1722,16 +1769,10 @@ class NeonCli(AbstractNeonCli): def pageserver_start( self, id: int, - overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + start_args = ["pageserver", "start", f"--id={id}"] storage = self.env.pageserver_remote_storage - append_pageserver_param_overrides( - params_to_update=start_args, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) if isinstance(storage, S3Storage): s3_env_vars = storage.access_env_vars() @@ -1782,6 +1823,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1805,6 +1847,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1816,6 +1860,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1830,6 +1875,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1939,6 +1986,55 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. 
+ """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! 
+ cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) @@ -1946,12 +2042,13 @@ class StorageControllerApiException(Exception): self.status_code = status_code -class NeonStorageController(MetricsGetter): +class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running @@ -2025,6 +2122,15 @@ class NeonStorageController(MetricsGetter): else: raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: @@ -2112,6 +2218,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2122,6 +2229,8 @@ class NeonStorageController(MetricsGetter): shard_params = {"count": shard_count} if shard_stripe_size is not None: shard_params["stripe_size"] = shard_stripe_size + else: + shard_params["stripe_size"] = 32768 body["shard_parameters"] = shard_params @@ -2129,12 +2238,15 @@ class NeonStorageController(MetricsGetter): for k, v in tenant_config.items(): body[k] = v + body["placement_policy"] = placement_policy + response = self.request( "POST", 
f"{self.env.storage_controller_api}/v1/tenant", json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) + response.raise_for_status() log.info(f"tenant_create success: {response.json()}") def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: @@ -2186,6 +2298,52 @@ class NeonStorageController(MetricsGetter): log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): + log.info(f"tenant_policy_update({tenant_id}, {body})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + + def reconcile_all(self): + r = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + headers=self.headers(TokenScope.ADMIN), + ) + r.raise_for_status() + n = r.json() + log.info(f"reconcile_all waited for {n} shards") + return n + + def reconcile_until_idle(self, timeout_secs=30): + start_at = time.time() + n = 1 + delay_sec = 0.5 + delay_max = 5 + while n > 0: + n = self.reconcile_all() + if n == 0: + break + elif time.time() - start_at > timeout_secs: + raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. 
+ time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) + def consistency_check(self): """ Throw an exception if the service finds any inconsistencies in its state @@ -2214,6 +2372,10 @@ class NeonStorageController(MetricsGetter): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + @property + def workdir(self) -> Path: + return self.env.repo_dir + def __enter__(self) -> "NeonStorageController": return self @@ -2231,24 +2393,21 @@ class LogCursor: _line_no: int -class NeonPageserver(PgProtocol): +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -2258,24 +2417,61 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. 
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) + + @property + def config_toml_path(self) -> Path: + return self.workdir / "pageserver.toml" + + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + """ + Edit the pageserver's config toml file in place. + """ + path = self.config_toml_path + with open(path, "r") as f: + config = toml.load(f) + edit_fn(config) + with open(path, "w") as f: + toml.dump(config, f) + + def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: + """ + Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`. + Returns the replaced values. + If there was no previous value, the key is mapped to None. + This allows to restore the original value by calling this method with the returned dict. 
+ """ + replacements = {} + + def doit(config: Dict[str, Any]): + while len(patch) > 0: + key, new = patch.popitem() + old = config.get(key, None) + config[key] = new + replacements[key] = old + + self.edit_config_toml(doit) + return replacements def start( self, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "NeonPageserver": """ @@ -2285,9 +2481,7 @@ class NeonPageserver(PgProtocol): """ assert self.running is False - self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars - ) + self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) self.running = True return self @@ -2384,44 +2578,6 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def assert_log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Tuple[str, LogCursor]: - """Convenient for use inside wait_until()""" - - res = self.log_contains(pattern, offset=offset) - assert res is not None - return res - - def log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Optional[Tuple[str, LogCursor]]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. 
- skip_until_line_no = 0 if offset is None else offset._line_no - cur_line_no = 0 - with logfile.open("r") as f: - for line in f: - if cur_line_no < skip_until_line_no: - cur_line_no += 1 - continue - if contains_re.search(line): - # found it! - cur_line_no += 1 - return (line, LogCursor(cur_line_no)) - return None - def tenant_attach( self, tenant_id: TenantId, @@ -2456,8 +2612,10 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -2485,32 +2643,36 @@ class NeonPageserver(PgProtocol): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) + def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. -def append_pageserver_param_overrides( - params_to_update: List[str], - remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) + :return: list of relative paths to layers, from the timeline root. 
+ """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) - params_to_update.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) ) - else: - params_to_update.append('--pageserver-config-override=remote_storage=""') - env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") - if env_overrides is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") - ] - - if pageserver_config_override is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] class PgBin: @@ -3200,6 +3362,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3222,6 +3385,7 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3238,7 +3402,10 @@ class Endpoint(PgProtocol): return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. 
@@ -3254,6 +3421,7 @@ class Endpoint(PgProtocol): safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3383,6 +3551,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -3398,7 +3567,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -3567,7 +3741,7 @@ class Safekeeper: return self def stop(self, immediate: bool = False) -> "Safekeeper": - log.info("Stopping safekeeper {}".format(self.id)) + log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -3661,13 +3835,15 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) @@ -3676,6 +3852,13 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot 
output: {stdout}") + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" @@ -3999,13 +4182,13 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint for f in mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ec0f81b380..58a76d7586 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -86,6 +86,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # This is especially pronounced in tests that set small checkpoint # distances. ".*Flushed oversized open layer with size.*", + # During teardown, we stop the storage controller before the pageservers, so pageservers + # can experience connection errors doing background deletion queue work. 
+ ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + # Can happen when the test shuts down the storage controller while it is calling the utilization API + ".*WARN.*path=/v1/utilization .*request was dropped before completing", ) @@ -96,6 +101,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*ReceiveBody.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", + # Tests run in dev mode + ".*Starting in dev mode.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6aebfbc99c..b06972056c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -293,7 +293,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: @@ -308,6 +307,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): params=params, ) self.verbose_error(res) + return res.json() def tenant_list_locations(self): res = self.get( @@ -341,8 +341,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") self.verbose_error(res) - def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. 
This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" + + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -806,6 +819,23 @@ class PageserverHttpClient(requests.Session, MetricsGetter): continue self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def detach_ancestor( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + batch_size: int | None = None, + ) -> Set[TimelineId]: + params = {} + if batch_size is not None: + params["batch_size"] = batch_size + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", + params=params, + ) + self.verbose_error(res) + json = res.json() + return set(map(TimelineId, json["reparented_timelines"])) + def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str ): diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 72fa30a2f2..1fb618f445 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -1,3 +1,4 @@ +import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union @@ -11,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -25,7 +26,7 @@ class ImageLayerFileName: @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key @@ -40,65 +41,57 @@ class DeltaLayerFileName: return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, DeltaLayerName] class 
InvalidFileName(Exception): pass +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) + + def parse_image_layer(f_name: str) -> Tuple[int, int, int]: """Parse an image layer file name. Return key start, key end, and snapshot lsn""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - try: - return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + + match = IMAGE_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an image layer filename") + + return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16) + + +DELTA_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: """Parse a delta layer file name. 
Return key start, key end, lsn start, and lsn end""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - lsn_parts = parts[1].split("-") - if len(lsn_parts) != 2: - raise InvalidFileName( - f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}" - ) - try: - return ( - int(key_parts[0], 16), - int(key_parts[1], 16), - int(lsn_parts[0], 16), - int(lsn_parts[1], 16), - ) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + match = DELTA_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an delta layer filename") + + return ( + int(match.group(1), 16), + int(match.group(2), 16), + int(match.group(3), 16), + int(match.group(4), 16), + ) -def parse_layer_file_name(file_name: str) -> LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -110,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization 
from the given disk_consistent_lsn. """ - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -131,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index cf64c86821..4b0dd7a815 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -62,9 +62,7 @@ def wait_for_upload( ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn, current_lsn - ) + f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}" ) @@ -206,13 +204,11 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - tenant, timeline, lsn, current_lsn, i + 1 - ) + f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}" ) time.sleep(0.1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}" ) diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index c8ab550ad7..77523a542b 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -5,6 +5,7 @@ import 
pytest from _pytest.python import Metafunc from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore """ Dynamically parametrize tests by different parameters @@ -31,6 +32,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_aux_file_policy() -> Optional[AuxFileStore]: + return None + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 60591d8d46..132d2450a7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -50,7 +50,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. 
- self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -141,11 +141,13 @@ class LocalFsStorage: with self.heatmap_path(tenant_id).open("r") as f: return json.load(f) - def to_toml_inline_table(self) -> str: - rv = { + def to_toml_dict(self) -> Dict[str, Any]: + return { "local_path": str(self.root), } - return toml.TomlEncoder().dump_inline_table(rv) + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -194,7 +196,7 @@ class S3Storage: } ) - def to_toml_inline_table(self) -> str: + def to_toml_dict(self) -> Dict[str, Any]: rv = { "bucket_name": self.bucket_name, "bucket_region": self.bucket_region, @@ -206,7 +208,10 @@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint - return toml.TomlEncoder().dump_inline_table(rv) + return rv + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def do_cleanup(self): if not self.cleanup: @@ -252,8 +257,11 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + def tenant_path(self, tenant_id: TenantId) -> str: - return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + return f"{self.tenants_path()}/{tenant_id}" def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -262,6 +270,9 @@ class S3Storage: r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) return json.loads(r["Body"].read().decode("utf-8")) + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert 
self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] @@ -408,6 +419,13 @@ def default_remote_storage() -> RemoteStorageKind: return RemoteStorageKind.LOCAL_FS +def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): + raise Exception("invalid remote storage type") + + return remote_storage.to_toml_dict() + + # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 80c9b9ce9a..b5458b5c26 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -156,7 +156,11 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) def __repr__(self): return self.__str__() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 9365d65fc9..6470621900 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import contextlib +import enum import json import os import re @@ -484,3 +485,16 @@ def assert_no_errors(log_file, service, allowed_errors): log.info(f"not allowed {service} error: {error.strip()}") assert not errors, f"Log errors on {service}: {errors[0]}" + + +@enum.unique +class AuxFileStore(str, enum.Enum): + V1 = "V1" + V2 = "V2" + CrossValidation = "CrossValidation" + + def __repr__(self) -> str: + return f"'aux-{self.value}'" + + def __str__(self) -> str: + return f"'aux-{self.value}'" diff --git a/test_runner/fixtures/workload.py 
b/test_runner/fixtures/workload.py index ab8717de54..c44628ce06 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -81,9 +81,13 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None + + def __del__(self): + self.stop() def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 324ef0d516..b66db4d0ab 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -125,19 +125,19 @@ async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql """ - CREATE PROCEDURE updating{0}() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..{1} LOOP - UPDATE {0} SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """.format(table, n_txns) ) await conn.execute("SET statement_timeout=0") await conn.execute(f"call updating{table}()") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 9777bf6748..7687b8417f 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -78,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - 
env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") @@ -140,10 +140,14 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier + def patch_default_tenant_config(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["compaction_period"] = "3s" + tenant_config["gc_period"] = "0s" + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", - ), # this does print more than we want, but the number should be comparable between runs extra_env_vars={ "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 9e3f602237..1df3f2f5f1 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,5 +1,6 @@ from contextlib import closing +import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 @@ -17,6 +18,7 @@ from fixtures.types import Lsn # 3. Disk space used # 4. 
Peak memory usage # +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..632d465c3f --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,201 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TenantShardId, TimelineId + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. 
+ "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. 
+ ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. 
+ tenant_config={"heatmap_period": "10s"}, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. + shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. 
+ while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. + readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. + env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. 
+ env.storage_controller.stop() diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 50243e3ea7..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3058926b25..693add422f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -189,6 +189,8 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", + "image_layer_creation_check_threshold": 1, + "switch_aux_file_policy": "CrossValidation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index ea88b5d8e9..bb622c0d59 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -105,7 +105,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): # The neon_local tool generates one key pair at a hardcoded path by default. # As a preparation for our test, move the public key of the key pair into a # directory at the same location as the hardcoded path by: - # 1. moving the the file at `configured_pub_key_path` to a temporary location + # 1. moving the file at `configured_pub_key_path` to a temporary location # 2. creating a new directory at `configured_pub_key_path` # 3. 
moving the file from the temporary location into the newly created directory configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c808fa0f54..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,10 +1,13 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") @@ -20,12 +23,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 2a7a3c41ac..9fe9f77fea 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,6 +1,7 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest @@ -84,11 +85,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - 
env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) @@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 804ad135ce..1279c1bf81 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,11 +17,16 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. 
def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -84,7 +89,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..43a3323462 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,192 @@ +import json +import os +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + 
"pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. 
+ """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job. 
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size is not None + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. 
+ assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e0bb4c2062..787c114fc1 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,27 +226,26 @@ def test_forward_compatibility( ) try: - # TODO: remove this once the previous pageserrver version understands - # the 'get_vectored_impl' config - neon_env_builder.pageserver_get_vectored_impl = None + # Previous version neon_local and pageserver are not aware + # of the new config. + # TODO: remove these once the previous version of neon local supports them + neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None neon_env_builder.num_safekeepers = 3 - neon_local_binpath = neon_env_builder.neon_binpath + + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. + neon_env_builder.neon_binpath = compatibility_neon_bin + neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir + neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", - neon_binpath=compatibility_neon_bin, - pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # TODO: remove this workaround after release-5090 is no longer the most recent release. - # There was a bug in that code that generates a warning in the storage controller log. - env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") - - # Use current neon_local even though we're using old binaries for - # everything else: our test code is written for latest CLI args. 
- env.neon_local_binpath = neon_local_binpath - - neon_env_builder.start(register_pageservers=True) + neon_env_builder.start() check_neon_works( env, @@ -267,9 +266,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): ep = env.endpoints.create_start("main") + connstr = ep.connstr() + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -286,6 +286,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version + # Stop endpoint while we recreate timeline + ep.stop() + try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) except PageserverApiException as e: @@ -310,6 +313,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r existing_initdb_timeline_id=timeline_id, ) + # Timeline exists again: restart the endpoint + ep.start() + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) diff --git 
a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index b83545216d..5e9efa7cce 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest -import toml from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -45,17 +44,16 @@ def test_min_resident_size_override_handling( ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["min_resident_size_override"] = config_level_override + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(set_min_resident_size) + env.pageserver.stop() + env.pageserver.start() tenant_id, _ = env.neon_cli.create_tenant() assert_overrides(tenant_id, config_level_override) @@ -164,34 +162,32 @@ class EvictionEnv: usage eviction task is unknown; it might need to run one more iteration before assertions can be made. """ - disk_usage_config = { - "period": period, - "max_usage_pct": max_usage_pct, - "min_avail_bytes": min_avail_bytes, - "mock_statvfs": mock_behavior, - "eviction_order": eviction_order.config(), - } - - enc = toml.TomlEncoder() # these can sometimes happen during startup before any tenants have been # loaded, so nothing can be evicted, we just wait for next iteration which # is able to evict. 
pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") - pageserver.start( - overrides=( - "--pageserver-config-override=disk_usage_based_eviction=" - + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + pageserver.patch_config_toml_nonrecursive( + { + "disk_usage_based_eviction": { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), + }, # Disk usage based eviction runs as a background task. # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. # But, we only have a 10-second-timeout in this test. # So, disable the delay for this test. - "--pageserver-config-override=background_task_maximum_delay='0s'", - ), + "background_task_maximum_delay": "0s", + } ) + pageserver.start() + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index cb4fa43be7..7471338ce5 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,6 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, @@ -86,14 +87,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # path = env.remote_storage.timeline_path(tenant_id, timeline_id) l1_found = None - for path in 
env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - + for path in env.pageserver.list_layers(tenant_id, timeline_id): [key_range, lsn_range] = path.name.split("__", maxsplit=1) if "-" not in lsn_range: @@ -108,19 +102,21 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) if l1_found is not None: raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path + l1_found = parse_layer_file_name(path.name) assert l1_found is not None, "failed to find L1 locally" uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" env.pageserver.start() wait_until_tenant_active(pageserver_http, tenant_id) - assert not l1_found.exists(), "partial compaction result should had been removed during startup" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, l1_found + ), "partial compaction result should had been removed during startup" # wait for us to catch up again wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) @@ -130,18 +126,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found.name}" + message = f".*duplicated L1 layer layer={l1_found}" found_msg = env.pageserver.log_contains(message) # resident or evicted, it should not be overwritten, however it should had been non-existing at startup assert ( found_msg is None ), "layer should had been removed during startup, did it live on as evicted?" 
- assert l1_found.exists(), "the L1 reappears" + assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0497e1965c..179cc273ec 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,7 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -84,3 +84,98 @@ def test_hot_standby(neon_simple_env: NeonEnv): # clean up if slow_down_send: sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) + + +def test_2_replicas_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + time.sleep(1) + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary1" + ) as secondary1: + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary2" + ) as secondary2: + wait_replica_caughtup(primary, secondary1) + wait_replica_caughtup(primary, secondary2) + + +# We had an issue that a standby server made GetPage requests with an +# old LSN, based on the last-written LSN cache, to avoid waits in the +# pageserver. 
However, requesting a page with a very old LSN, such +# that the GC horizon has already advanced past it, results in an +# error from the pageserver: +# "Bad request: tried to request a page version that was garbage collected" +# +# To avoid that, the compute<-> pageserver protocol was updated so +# that that the standby now sends two LSNs, the old last-written LSN +# and the current replay LSN. +# +# https://github.com/neondatabase/neon/issues/6211 +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. + config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. 
+ s_cur.execute("SELECT clear_buffer_cache()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 2fdee89389..77dc8a35b5 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -57,9 +57,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): time.sleep(10) # 
Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, timeline - ) + timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" log.info(f"Check {timeline_path}") for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 7bbc0cc160..b178baea11 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind @@ -57,9 +58,9 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) - initial_local_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + initial_local_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( len(initial_local_layers) > 1 @@ -73,6 +74,7 @@ def test_basic_eviction( assert len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers + ), f"Did not find 
returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] ) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] - assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got 
{returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers @@ -154,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something @@ -165,6 +172,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): "compaction_threshold": "3", # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers + "image_layer_creation_check_threshold": "0", # always check if a new image 
layer can be created } def tenant_update_config(changes): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ca4295c5cb..cc34fd83e9 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -3,8 +3,8 @@ import time from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -53,6 +53,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "checkpoint_timeout": "24h", # something we won't reach "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually "image_creation_threshold": "100", # we want to control when image is created + "image_layer_creation_check_threshold": "0", "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } @@ -80,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -145,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the 
future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 3f4ca8070d..57d3447cae 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -6,7 +6,9 @@ from string import ascii_lowercase import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnv, + NeonEnvBuilder, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -18,6 +20,19 @@ def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation] +) +def 
test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(env.initial_tenant).effective_config + assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -159,6 +174,9 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -203,8 +221,86 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) +# Tests that walsender correctly blocks until WAL is downloaded from safekeepers +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + cur.execute("create table t(a int)") + cur.execute("create publication pub for table t") + cur.execute("insert into t values (1)") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(a int)") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication 
pub") + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + + # Pause the safekeepers so that they can't send WAL (except to pageserver) + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "return")]) + + # Insert a 2 + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (2)") + + endpoint.stop_and_destroy() + + # This new endpoint should contain [1, 2], but it can't access WAL from safekeeper + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + cur.execute("select * from t") + res = [r[0] for r in cur.fetchall()] + assert res == [1, 2] + + # Reconnect subscriber + vanilla_pg.start() + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'") + + time.sleep(5) + # Make sure the 2 isn't replicated + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1] + + # Re-enable WAL download + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "off")]) + + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] + + # Check that local reads also work + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (3)") + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] + + log_path = vanilla_pg.pgdatadir / "pg.log" + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "could not receive data from WAL stream" not in logs + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): env = 
neon_simple_env @@ -295,6 +391,9 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -364,3 +463,70 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) +def test_replication_shutdown(neon_simple_env: NeonEnv): + # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed + env = neon_simple_env + env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + pub = env.endpoints.create("test_replication_shutdown_publisher") + + env.neon_cli.create_branch("test_replication_shutdown_subscriber") + sub = env.endpoints.create("test_replication_shutdown_subscriber") + + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 
'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5813231aab..225622868d 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,3 +1,4 @@ +import re import time from datetime import datetime, timedelta, timezone @@ -109,6 +110,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") @@ -116,11 +119,11 @@ def test_ts_of_lsn_api(neon_env_builder: 
NeonEnvBuilder): cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. + # Enable synchronous commit as we are timing sensitive # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. - cur.execute("SET synchronous_commit=off") + cur.execute("SET synchronous_commit=on") cur.execute("CREATE TABLE foo (x integer)") tbl = [] for i in range(1000): @@ -129,7 +132,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) - time.sleep(0.005) + time.sleep(0.02) # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions @@ -177,8 +180,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e31e1cab51..39b4865026 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,3 +1,4 @@ +import time from contextlib import closing from fixtures.log_helper import log @@ -43,6 +44,12 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: + cur.execute("SELECT 
extversion from pg_extension where extname='neon'") + # IMPORTANT: + # If the version has changed, the test should be updated. + # Ensure that the default version is also updated in the neon.control file + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): @@ -60,3 +67,30 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): cur.execute( f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" ) + + +# Verify that the neon extension can be auto-upgraded to the latest version. +def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + + endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.0';") + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() == ("1.0",) # Ensure the extension gets downgraded + + endpoint_main.stop() + time.sleep(1) + endpoint_main.start() + time.sleep(1) + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() != ("1.0",) # Ensure the extension gets upgraded diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 914f068afb..6c2556f6a2 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -333,6 +333,17 @@ def test_download_remote_layers_api( } ) + # 
This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and its validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. + env.pageserver.allowed_errors.extend( + [ + ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*", + ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*" + ] + ) + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() @@ -568,6 +579,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, + # Always check if a new image layer can be created + "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } @@ -632,7 +645,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky - env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + conf["image_creation_threshold"] = "1" + env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py new file mode 100644 index 0000000000..0b36b32552 --- /dev/null +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -0,0 +1,131 @@ +from typing import
Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +# +# Test on-demand download of the pg_xact SLRUs +# +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. 
+ endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first 
connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. + conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 81aed704bb..bd7e4f118f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,6 +2,7 @@ import subprocess from pathlib import Path from typing import Optional +import toml from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -12,10 +13,11 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -# test that we cannot override node id after init -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -29,18 +31,36 @@ def test_pageserver_init_node_id( stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", 
f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + good_init_cmd = [ + "--init", + f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 @@ -49,11 +69,7 @@ def test_pageserver_init_node_id( bad_reinit = run_pageserver(good_init_cmd) assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) - assert bad_update.returncode == 1, "pageserver should not allow updating node id" - assert "has node id already, it cannot be overridden" in bad_update.stderr + assert "config file already exists" in bad_reinit.stderr def check_client(env: NeonEnv, client: PageserverHttpClient): diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + 
+@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 56b4548b64..58eaf404d3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,8 +9,8 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum +import os import re import time from typing import Optional @@ -23,6 +23,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PgBin, S3Scrubber, + flush_ep_to_pageserver, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException @@ -31,6 +32,7 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, + 
wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, @@ -53,6 +55,7 @@ TENANT_CONF = { "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } @@ -111,7 +114,6 @@ def generate_uploads_and_deletions( last_flush_lsn_upload( env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id ) - ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers for i in range(0, 2): @@ -121,6 +123,17 @@ def generate_uploads_and_deletions( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. + final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None @@ -208,7 +221,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + replaced_config = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": "", + } + ) + env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) 
env.neon_cli.create_tenant( @@ -239,8 +257,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive(replaced_config) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) @@ -385,9 +403,8 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) @@ -514,9 +531,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. 
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + replaced = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageserver.start() # The pageserver should provide service to clients generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -538,6 +558,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -680,3 +701,50 @@ def test_multi_attach( # All data we wrote while multi-attached remains readable workload.validate(pageservers[2].id) + + +@pytest.mark.skip(reason="To be enabled after release with new local path style") +def test_upgrade_generationless_local_file_paths( + neon_env_builder: NeonEnvBuilder, +): + """ + Test pageserver behavior when starting up with local layer paths without + generation numbers: it should accept these layer files, and avoid doing + a delete/download cycle on them.
+ """ + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(1000) + + env.pageserver.stop() + + # Rename the local paths to legacy format, to simulate what + # we would see when upgrading + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 + + assert files_renamed > 0 + + env.pageserver.start() + + workload.validate() + + # Assert that there were no on-demand downloads + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_remote_ondemand_downloaded_layers_total" + ) + == 0 + ) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py new file mode 100644 index 0000000000..c5dc0f2919 --- /dev/null +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -0,0 +1,313 @@ +import asyncio +import os +import time +from typing import Optional, Tuple + +import psutil +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + + +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + + with 
env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, tenant_conf, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, tenant_conf, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_until_pageserver_has_uploaded( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + 
return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. + try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +def get_dirty_bytes(env): + v = env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") or 0 + log.info(f"dirty_bytes: {v}") + return v + + +def assert_dirty_bytes(env, v): + assert get_dirty_bytes(env) == v + + +def assert_dirty_bytes_nonzero(env): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. + "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. 
+ wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they + # must be uploaded to remain visible to the pageserver after restart. + wait_until_pageserver_has_uploaded(env, last_flush_lsns) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since + # we froze, flushed and uploaded everything before restarting. There can be no more WAL writes + # because we shut down compute endpoints before flushing. + assert get_dirty_bytes(env) == 0 + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + assert total_wal_ingested_after_restart == 0 + + +def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): + """ + Test that `checkpoint_timeout` is enforced even if there is no safekeeper input. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # Stop the safekeepers, so that we cannot have any more WAL receiver connections + for sk in env.safekeepers: + sk.stop() + + # We should have got here fast enough that we didn't hit the background interval yet, + # and the teardown of SK connections shouldn't prompt any layer freezing. + assert get_dirty_bytes(env) > 0 + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. 
+ + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + + +@pytest.mark.skipif( + # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is + # prohibitively slow in debug mode + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_total_size_limit(neon_env_builder: NeonEnvBuilder): + """ + Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is + individually exceeding checkpoint thresholds. + """ + + system_memory = psutil.virtual_memory().total + + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on + # a system with 128GB of RAM). We will then write enough data to violate this limit. 
+ max_dirty_data = 128 * 1024 * 1024 + ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory + assert ephemeral_bytes_per_memory_kb > 0 + + neon_env_builder.pageserver_config_override = f""" + ephemeral_bytes_per_memory_kb={ephemeral_bytes_per_memory_kb} + """ + + compaction_period_s = 10 + + tenant_conf = { + # Large space + time thresholds: effectively disable these limits + "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_timeout": "3600s", + "compaction_period": f"{compaction_period_s}s", + } + + env = neon_env_builder.init_configs() + env.start() + + timeline_count = 10 + + # This is about 2MiB of data per timeline + entries_per_timeline = 100_000 + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_bytes_ingested = 0 + for tenant, timeline, last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + total_bytes_ingested += last_flush_lsn - initdb_lsn + + log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") + assert total_bytes_ingested > max_dirty_data + + # Expected end state: the total physical size of all the tenants is in excess of the max dirty + # data, but the total amount of dirty data is less than the limit: this demonstrates that we + # have exceeded the threshold but then rolled layers in response + def get_total_historic_layers(): + total_ephemeral_layers = 0 + total_historic_bytes = 0 + for tenant, timeline, _last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + layer_map = http_client.layer_map_info(tenant, timeline) + total_historic_bytes += sum( + layer.layer_file_size + for layer in layer_map.historic_layers + if layer.layer_file_size is not None and 
Lsn(layer.lsn_start) > initdb_lsn + ) + total_ephemeral_layers += len(layer_map.in_memory_layers) + + log.info( + f"Total historic layer bytes: {total_historic_bytes} ({total_ephemeral_layers} ephemeral layers)" + ) + + return total_historic_bytes + + def assert_bytes_rolled(): + assert total_bytes_ingested - get_total_historic_layers() <= max_dirty_data + + # Wait until enough layers have rolled that the amount of dirty data is under the threshold. + # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing + # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. + wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + + # The end state should also have the reported metric under the limit + def assert_dirty_data_limited(): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes < max_dirty_data + + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 042961baa5..c34ef46d07 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -1,4 +1,6 @@ +import gzip import json +import os import time from dataclasses import dataclass from pathlib import Path @@ -10,7 +12,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + remote_storage_to_toml_inline_table, +) from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request @@ -40,6 +46,9 @@ def test_metric_collection( uploads.put((events, is_last == "true")) return Response(status=200) + 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + assert neon_env_builder.pageserver_remote_storage is not None + # Require collecting metrics frequently, since we change # the timeline and want something to be logged about it. # @@ -48,12 +57,11 @@ def test_metric_collection( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" + metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") # mock http server that returns OK for the metrics @@ -70,6 +78,7 @@ def test_metric_collection( # we have a fast rate of calculation, these can happen at shutdown ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -166,6 +175,20 @@ def test_metric_collection( httpserver.check() + # Check that at least one bucket output object is present, and that all + # can be decompressed and decoded. 
+ bucket_dumps = {} + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): + for file in files: + file_path = os.path.join(dirpath, file) + log.info(file_path) + if file.endswith(".gz"): + bucket_dumps[file_path] = json.load(gzip.open(file_path)) + + assert len(bucket_dumps) >= 1 + assert all("events" in data for data in bucket_dumps.values()) + def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 753898f747..759e845927 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -20,7 +20,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -55,7 +58,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() # We reloaded our tenant - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 2e57136607..c40bb962f2 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,16 +1,18 @@ import json import os import random -from pathlib import Path +import time from typing import Any, Dict, Optional import pytest from 
fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, tenant_delete_wait_completed, + wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId @@ -49,9 +51,13 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) @@ -89,6 +95,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown ".*downloading failed, possibly for shutdown", + # {tenant_id=... timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active. Current state: Stopping\n' + ".*page_service.*will not become active.*", ] ) @@ -398,32 +406,6 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: - """ - Inspect local storage on a pageserver to discover which layer files are present. 
- - :return: list of relative paths to layers, from the timeline root. - """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) - - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) - - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) - ) - - def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -472,10 +454,14 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): log.info("Synchronizing after initial write...") ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached pageserver, check secondary downloads them @@ -484,11 +470,26 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): workload.churn_rows(128, ps_attached.id) ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - 
) + try: + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id + ) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. @@ -500,8 +501,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] + some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -514,13 +515,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] ) assert layer_to_evict.name not in heatmap_layers - assert some_other_layer.name in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, 
timeline_id ) except: # On assertion failures, log some details to help with debugging @@ -560,6 +561,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. 
+ time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert ps_secondary.list_layers(tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert ps_secondary.list_layers(tenant_id, timelines[0]) + + # This one was deleted + assert not ps_secondary.list_layers(tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. 
+ total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): @@ -600,7 +686,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) # Expect lots of layers - assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10 + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py deleted file mode 100644 index 5d55020e3c..0000000000 --- a/test_runner/regress/test_pageserver_small_inmemory_layers.py +++ /dev/null @@ -1,110 +0,0 @@ -import asyncio -import time -from typing import Tuple - -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - tenant_get_shards, -) -from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import wait_until - -TIMELINE_COUNT = 10 -ENTRIES_PER_TIMELINE = 10_000 -CHECKPOINT_TIMEOUT_SECONDS = 60 - -TENANT_CONF = { - # Large 
`checkpoint_distance` effectively disables size - # based checkpointing. - "checkpoint_distance": f"{2 * 1024 ** 3}", - "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", -} - - -async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF) - with env.endpoints.create_start("main", tenant_id=tenant) as ep: - conn = await ep.connect_async() - try: - await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") - await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" - ) - finally: - await conn.close(timeout=10) - - last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn - - -async def workload( - env: NeonEnv, timelines: int, entries: int -) -> list[Tuple[TenantId, TimelineId, Lsn]]: - workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)] - return await asyncio.gather(*workers) - - -def wait_until_pageserver_is_caught_up( - env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] -): - for tenant, timeline, last_flush_lsn in last_flush_lsns: - shards = tenant_get_shards(env, tenant) - for tenant_shard_id, pageserver in shards: - waited = wait_for_last_record_lsn( - pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn - ) - assert waited >= last_flush_lsn - - -def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: - def query(): - value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") - assert value is not None - return value - - # The metric gets initialised on the first update. - # Retry a few times, but return 0 if it's stable. 
- try: - return float(wait_until(3, 0.5, query)) - except Exception: - return 0 - - -@pytest.mark.parametrize("immediate_shutdown", [True, False]) -def test_pageserver_small_inmemory_layers( - neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool -): - """ - Test that open layers get flushed after the `checkpoint_timeout` config - and do not require WAL reingest upon restart. - - The workload creates a number of timelines and writes some data to each, - but not enough to trigger flushes via the `checkpoint_distance` config. - """ - env = neon_env_builder.init_configs() - env.start() - - last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) - wait_until_pageserver_is_caught_up(env, last_flush_lsns) - - ps_http_client = env.pageserver.http_client() - total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) - - log.info("Sleeping for checkpoint timeout ...") - time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5) - - env.pageserver.restart(immediate=immediate_shutdown) - wait_until_pageserver_is_caught_up(env, last_flush_lsns) - - total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) - - log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") - log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") - - leeway = total_wal_ingested_before_restart * 5 / 100 - assert total_wal_ingested_after_restart <= leeway diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index e4219ec7a6..2b1b7fff34 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -18,6 +18,7 @@ from fixtures.remote_storage import s3_storage def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, + build_type: str, pg_bin, capsys, base_dir: Path, @@ -30,6 +31,11 @@ def test_pg_regress( """ if shard_count is not None: neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + 
# Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..8e80efd9ba --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,61 @@ +import os +import shutil + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + 
wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) + + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py new file mode 100644 index 0000000000..03e8c7c0df --- /dev/null +++ b/test_runner/regress/test_postgres_version.py @@ -0,0 +1,35 @@ +import json +import re +from pathlib import Path + +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + + +def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): + """Test that Postgres version matches the one we expect""" + + with (base_dir / "vendor" / "revisions.json").open() as f: + expected_revisions = json.load(f) + + output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False) + stdout = Path(f"{output_prefix}.stdout") + assert stdout.exists(), "postgres --version didn't print anything to stdout" + + with stdout.open() as f: + output = f.read().strip() + + # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". 
+ pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + match = re.search(pattern, output, re.IGNORECASE) + assert match is not None, f"Can't parse {output} with {pattern}" + + version = match.group("version") + commit = match.group("commit") + + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 3e986a8f7b..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -566,38 +566,6 @@ async def test_sql_over_http2(static_proxy: NeonProxy): assert resp["rows"] == [{"answer": 42}] -def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): - static_proxy.safe_psql("create role http with login password 'http' superuser") - - static_proxy.safe_psql("create table test_table ( id int primary key )") - - # insert into a table, with a unique constraint, after sleeping for n seconds - query = "WITH temp AS ( \ - SELECT pg_sleep($1) as sleep, $2::int as id \ - ) INSERT INTO test_table (id) SELECT id FROM temp" - - # expect to fail with timeout - res = static_proxy.http_query( - query, - [static_proxy.http_timeout_seconds + 1, 1], - user="http", - password="http", - expected_code=400, - ) - assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" - - time.sleep(2) - - res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) - assert res["command"] == "INSERT", "HTTP query should insert" - assert res["rowCount"] == 1, "HTTP query should insert" - - res = static_proxy.http_query(query, [0, 1], user="http", 
password="http", expected_code=400) - assert ( - "duplicate key value violates unique constraint" in res["message"] - ), "HTTP query should conflict" - - def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. 
- time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. 
- f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index effb7e83f9..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,12 +17,19 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") + endpoint = env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -42,14 +49,12 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None @@ 
-59,35 +64,33 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -96,9 +99,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', 
NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -108,9 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -122,9 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -134,7 +131,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -143,21 +140,28 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. 
+ config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -165,31 +169,31 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper 
from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 6aac1e1d84..ab5c8be256 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -15,6 +15,13 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() + # We expect the pageserver to exit, which will cause storage storage controller + # requests to fail and warn. + env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) + # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 05f769b0e3..70c025c225 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,6 +1,3 @@ -# It's possible to run any regular test with the local fs remote storage via -# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
- import os import queue import shutil @@ -15,6 +12,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -245,6 +243,7 @@ def test_remote_storage_upload_queue_retries( "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } ) @@ -831,17 +830,19 @@ def test_compaction_waits_for_upload( assert len(upload_stuck_layers) > 0 for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "while uploads are stuck the layers should be present on disk" # now this will do the L0 => L1 compaction and want to remove # upload_stuck_layers and the original initdb L0 client.timeline_checkpoint(tenant_id, timeline_id) - # as uploads are paused, the the upload_stuck_layers should still be with us + # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "uploads are stuck still over compaction" compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() overlap = compacted_layers.intersection(upload_stuck_layers) @@ -875,9 +876,8 @@ def test_compaction_waits_for_upload( wait_until(10, 1, until_layer_deletes_completed) for name in upload_stuck_layers: - path = 
env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" # We should not have hit the error handling path in uploads where a uploaded file is gone diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py index b4699c7be8..2360745990 100644 --- a/test_runner/regress/test_replication_start.py +++ b/test_runner/regress/test_replication_start.py @@ -1,7 +1,9 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +@pytest.mark.xfail def test_replication_start(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 611bd1c2a2..9227836862 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -47,7 +47,7 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 # create two timelines one being the parent of another, both with non-trivial data parent = None @@ -72,13 +72,13 @@ def test_tenant_s3_restore( time.sleep(4) assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 ), "tenant removed before we deletion was issued" iterations = poll_for_remote_storage_iterations(remote_storage_kind) tenant_delete_wait_completed(ps_http, tenant_id, iterations) ps_http.deletion_queue_flush(execute=True) assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + 
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 ), "tenant removed before we deletion was issued" env.storage_controller.attach_hook_drop(tenant_id) @@ -116,4 +116,4 @@ def test_tenant_s3_restore( # There might be some activity that advances the lsn so we can't use a strict equality check assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py new file mode 100644 index 0000000000..018c1637d0 --- /dev/null +++ b/test_runner/regress/test_s3_scrubber.py @@ -0,0 +1,111 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + S3Scrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.types import TenantShardId +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. 
+ """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. 
+ workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = S3Scrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index e6318aff68..d33803250f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -10,9 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( 
NeonEnv, NeonEnvBuilder, + S3Scrubber, StorageControllerApiException, + last_flush_lsn_upload, tenant_get_shards, + wait_for_last_flush_lsn, ) +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until @@ -67,6 +71,15 @@ def test_sharding_smoke( log.info(f"sizes = {sizes}") return sizes + # The imported initdb for timeline creation should + # not be fully imported on every shard. We use a 1MB strripe size so expect + # pretty good distribution: no one shard should have more than half the data + sizes = get_sizes() + physical_initdb_total = sum(sizes.values()) + expect_initdb_size = 20 * 1024 * 1024 + assert physical_initdb_total > expect_initdb_size + assert all(s < expect_initdb_size // 2 for s in sizes.values()) + # Test that timeline creation works on a sharded tenant timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) @@ -99,6 +112,38 @@ def test_sharding_smoke( env.storage_controller.consistency_check() + # Validate that deleting a sharded tenant removes all files in the prefix + + # Before deleting, stop the client and check we have some objects to delete + workload.stop() + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Check the scrubber isn't confused by sharded content, then disable + # it during teardown because we'll have deleted by then + S3Scrubber(neon_env_builder).scan_metadata() + neon_env_builder.scrub_on_exit = False + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + env.storage_controller.consistency_check() + def test_sharding_split_unsharded( neon_env_builder: NeonEnvBuilder, @@ -132,6 +177,67 @@ 
def test_sharding_split_unsharded( env.storage_controller.consistency_check() +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): + """ + Test that after a split, we clean up parent layer data in the child shards via compaction. + """ + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + workload.stop() + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + workload.stop() + + env.storage_controller.consistency_check() + + # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Invoke 
compaction: this should drop any layers that don't overlap with the shard's key stripes + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id) + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because some layers have been dropped + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Compaction shouldn't make anything unreadable + workload.validate() + + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, ): @@ -146,7 +252,7 @@ def test_sharding_split_smoke( # 8 shards onto separate pageservers shard_count = 4 split_shard_count = 8 - neon_env_builder.num_pageservers = split_shard_count + neon_env_builder.num_pageservers = split_shard_count * 2 # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given @@ -174,6 +280,7 @@ def test_sharding_split_smoke( placement_policy='{"Attached": 1}', conf=non_default_tenant_config, ) + workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -241,6 +348,11 @@ def test_sharding_split_smoke( == shard_count ) + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -252,6 +364,10 @@ def test_sharding_split_smoke( # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) + # Enough background reconciliations should result in the shards being properly distributed. 
+ # Run this before the workload, because its LSN-waiting code presumes stable locations. + env.storage_controller.reconcile_until_idle(timeout_secs=60) + workload.validate() workload.churn_rows(256) @@ -265,27 +381,6 @@ def test_sharding_split_smoke( pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) workload.validate() - migrate_to_pageserver_ids = list( - set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids) - ) - assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count - - # Migrate shards away from the node where the split happened - for ps_id in pre_split_pageserver_ids: - shards_here = [ - tenant_shard_id - for (tenant_shard_id, pageserver) in all_shards - if pageserver.id == ps_id - ] - assert len(shards_here) == 2 - migrate_shard = shards_here[0] - destination = migrate_to_pageserver_ids.pop() - - log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.storage_controller.tenant_shard_migrate(migrate_shard, destination) - - workload.validate() - # Assert on how many reconciles happened during the process. This is something of an # implementation detail, but it is useful to detect any bugs that might generate spurious # extra reconcile iterations. @@ -294,8 +389,9 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. 
- # - shard_count reconciles for the migrations we did to move child shards away from their split location - expect_reconciles = shard_count * 2 + split_shard_count + shard_count + # - shard_count of the child shards will need to fail over to their secondaries + # - shard_count of the child shard secondary locations will get moved to emptier nodes + expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) @@ -312,6 +408,10 @@ def test_sharding_split_smoke( assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 assert errored_reconciles is not None and int(errored_reconciles) == 0 + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + env.storage_controller.consistency_check() def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -343,6 +443,31 @@ def test_sharding_split_smoke( assert sum(total.values()) == split_shard_count * 2 check_effective_tenant_config() + # More specific check: that we are fully balanced. This is deterministic because + # the order in which we consider shards for optimization is deterministic, and the + # order of preference of nodes is also deterministic (lower node IDs win). + log.info(f"total: {total}") + assert total == { + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, + 9: 1, + 10: 1, + 11: 1, + 12: 1, + 13: 1, + 14: 1, + 15: 1, + 16: 1, + } + log.info(f"attached: {attached}") + assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards # correctly wrote config to disk, and the storage controller responds correctly # to /re-attach) @@ -401,6 +526,7 @@ def test_sharding_split_stripe_size( env.storage_controller.tenant_shard_split( tenant_id, shard_count=2, shard_stripe_size=new_stripe_size ) + env.storage_controller.reconcile_until_idle() # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute @@ -455,13 +581,11 @@ def test_sharding_split_stripe_size( os.getenv("BUILD_TYPE") == "debug", reason="Avoid running bulkier ingest tests in debug mode", ) -def test_sharding_ingest( +def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): """ - Check behaviors related to ingest: - - That we generate properly sized layers - - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + Check that when ingesting data to a sharded tenant, we properly respect layer size limits. # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic @@ -492,6 +616,7 @@ def test_sharding_ingest( workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) + workload.validate() small_layer_count = 0 @@ -504,7 +629,9 @@ def test_sharding_ingest( shard_id = shard["shard_id"] layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) - for layer in layer_map.historic_layers: + historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + + for layer in historic_layers: assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" @@ -541,6 +668,93 @@ def test_sharding_ingest( assert huge_layer_count <= shard_count +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance 
their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. 
+ workload.churn_rows(1, upload=False) + + # The LSN that has reached pageservers, but may not have been flushed to historic layers yet + expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id) + + # Don't leave the endpoint running, we don't want it writing in the background + workload.stop() + + log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}") + + shards = tenant_get_shards(env, tenant_id, None) + + def assert_all_disk_consistent(): + """ + Assert that all the shards' disk_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + + def assert_all_remote_consistent(): + """ + Assert that all the shards' remote_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + + workload.validate() + + class Failure: pageserver_id: Optional[int] @@ -784,6 +998,10 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel 
reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] ) @@ -869,16 +1087,23 @@ def test_sharding_split_failures( # Having failed+rolled back, we should be able to split again # No failures this time; it will succeed env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.reconcile_until_idle(timeout_secs=30) workload.churn_rows(10) workload.validate() if failure.expect_available(): - # Even though the split failed partway through, this should not have interrupted - # clients. Disable waiting for pageservers in the workload helper, because our - # failpoints may prevent API access. - # This only applies for failure modes that leave pageserver page_service API available. - workload.churn_rows(10, upload=False, ingest=False) + # Even though the split failed partway through, this should not leave the tenant in + # an unavailable state. + # - Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. This only applies for failure modes that + # leave pageserver page_service API available. + # - This is a wait_until because clients may see transient errors in some split error cases, + # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed + # inside the pageserver and the storage controller responds by detaching children and attaching + # parents concurrently (https://github.com/neondatabase/neon/issues/7148) + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + workload.validate() if failure.fails_forward(env): @@ -916,6 +1141,21 @@ def test_sharding_split_failures( finish_split() assert_split_done() + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. 
+ env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + + # Having completed the split, pump the background reconciles to ensure that + # the scheduler reaches an idle state + env.storage_controller.reconcile_until_idle(timeout_secs=30) + env.storage_controller.consistency_check() @@ -1044,3 +1284,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} 
VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_storage_controller.py similarity index 70% rename from test_runner/regress/test_sharding_service.py rename to test_runner/regress/test_storage_controller.py index b7488cadd6..bdd356388f 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,3 +1,5 @@ +import json +import threading import time from collections import defaultdict from datetime import datetime, timezone @@ -24,7 +26,8 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId -from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -41,11 +44,11 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_sharding_service_smoke( +def test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, ): """ - Test the basic lifecycle of a sharding service: + Test the basic lifecycle of a storage controller: - Restarting - Restarting a pageserver - Creating and deleting tenants and timelines @@ -89,6 +92,11 @@ def test_sharding_service_smoke( for tid in tenant_ids: env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + # Repeating a creation should be idempotent (we are just testing it doesn't return an error) + env.storage_controller.tenant_create( + tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant + ) + for node_id, count in 
get_node_shard_counts(env, tenant_ids).items(): # we used a multiple of pagservers for the total shard count, # so expect equal number on all pageservers @@ -198,7 +206,7 @@ def test_node_status_after_restart( env.storage_controller.consistency_check() -def test_sharding_service_passthrough( +def test_storage_controller_passthrough( neon_env_builder: NeonEnvBuilder, ): """ @@ -222,10 +230,14 @@ def test_sharding_service_passthrough( } assert status["state"]["slug"] == "Active" + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + assert synthetic_size > 0 + assert "segments" in size_inputs + env.storage_controller.consistency_check() -def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_a = env.initial_tenant tenant_b = TenantId.generate() @@ -260,14 +272,15 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("warm_up", [True, False]) -def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, but imports the generation number. """ - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -277,15 +290,18 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # This is the pageserver where we'll initially create the tenant. 
Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") - env.pageservers[0].start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + env.pageservers[0].patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageservers[0].start() origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -297,7 +313,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: origin_ps.http_client().tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode - origin_ps.http_client().tenant_location_conf( + r = origin_ps.http_client().tenant_location_conf( tenant_id, { "mode": "AttachedStale", @@ -306,6 +322,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 if warm_up: origin_ps.http_client().tenant_heatmap_upload(tenant_id) @@ -323,10 +340,13 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedMulti", @@ -335,6 +355,19 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: 
"generation": generation, }, ) + assert len(r["shards"]) == 1 + + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + assert describe["node_secondary"][0] != warm_up_ps # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( @@ -351,7 +384,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # set it to AttachedSingle: this is a no-op, but we test it because the # cloud control plane may call this for symmetry with live migration to # an individual pageserver - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -360,6 +393,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # We should see the tenant is now attached to the pageserver managed # by the sharding service @@ -390,7 +424,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # The generation has moved on since we onboarded assert generation != dest_tenant_before_conf_change["generation"] - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -400,17 +434,21 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) assert ( dest_tenant_after_conf_change["generation"] == 
dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check() -def test_sharding_service_compute_hook( +def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address, @@ -428,10 +466,13 @@ def test_sharding_service_compute_hook( # Set up fake HTTP notify endpoint notifications = [] + handle_params = {"status": 200} + def handler(request: Request): - log.info(f"Notify request: {request}") + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") notifications.append(request.json) - return Response(status=200) + return Response(status=status) httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) @@ -499,10 +540,28 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_split_notification) + # If the compute hook is unavailable, that should not block creating a tenant and + # creating a timeline. 
This simulates a control plane refusing to accept notifications + handle_params["status"] = 423 + degraded_tenant_id = TenantId.generate() + degraded_timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(degraded_tenant_id) + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id + ) + + # Ensure we hit the handler error path + env.storage_controller.allowed_errors.append( + ".*Failed to notify compute of attached pageserver.*tenant busy.*" + ) + env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*") + assert notifications[-1] is not None + assert notifications[-1]["tenant_id"] == str(degraded_tenant_id) + env.storage_controller.consistency_check() -def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test that just hits the endpoints to check that they don't bitrot. 
@@ -563,7 +622,7 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -def test_sharding_service_s3_time_travel_recovery( +def test_storage_controller_s3_time_travel_recovery( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): @@ -673,7 +732,7 @@ def test_sharding_service_s3_time_travel_recovery( env.storage_controller.consistency_check() -def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() svc = env.storage_controller @@ -697,13 +756,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): StorageControllerApiException, match="Forbidden: JWT authentication error", ): - svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) # Token with correct scope svc.request( "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + # No token with pytest.raises( StorageControllerApiException, @@ -737,7 +801,7 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): ) -def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): """ Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without supplying the whole LocationConf. 
@@ -840,7 +904,7 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) -def test_sharding_service_heartbeats( +def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): neon_env_builder.num_pageservers = 2 @@ -950,7 +1014,7 @@ def test_sharding_service_heartbeats( wait_until(10, 1, storage_controller_consistent) -def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): """ Exercise the behavior of the /re-attach endpoint on pageserver startup when pageservers have a mixture of attached and secondary locations @@ -1010,3 +1074,323 @@ def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) assert reconciles_after_restart == reconciles_before_restart + + +def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): + """ + Check that emergency hooks for disabling rogue tenants' reconcilers work as expected. 
+ """ + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + # Message from using a scheduling policy + ".*Scheduling is disabled by policy.*", + ".*Skipping reconcile for policy.*", + # Message from a node being offline + ".*Call to node .* management API .* failed", + ] + ) + + # Stop pageserver so that reconcile cannot complete + env.pageserver.stop() + + env.storage_controller.tenant_create(tenant_id, placement_policy="Detached") + + # Try attaching it: we should see reconciles failing + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": {"Attached": 0}, + }, + ) + + def reconcile_errors() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + or 0 + ) + + def reconcile_ok() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + or 0 + ) + + def assert_errors_gt(n) -> int: + e = reconcile_errors() + assert e > n + return e + + errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + + # Try reconciling again, it should fail again + with pytest.raises(StorageControllerApiException): + env.storage_controller.reconcile_all() + errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + + # Configure the tenant to disable reconciles + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + + # Try reconciling again, it should not cause an error (silently skip) + env.storage_controller.reconcile_all() + assert reconcile_errors() == errs + + # Start the pageserver and re-enable reconciles + env.pageserver.start() + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Active", + }, + ) + + def assert_ok_gt(n) -> int: + o = reconcile_ok() + 
assert o > n + return o + + # We should see a successful reconciliation + wait_until(10, 1, lambda: assert_ok_gt(0)) + + # And indeed the tenant should be attached + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. + """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. 
IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() + + +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(31000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join(timeout=10) + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. 
+ neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was 
written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a164c7f60a..363c3c88ec 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -64,7 +64,7 @@ def test_tenant_delete_smoke( ) # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 # create two timelines one being the parent of another parent = None @@ -90,9 +90,9 @@ def test_tenant_delete_smoke( iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 tenant_delete_wait_completed(ps_http, tenant_id, iterations) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() @@ -108,7 +108,7 @@ def test_tenant_delete_smoke( ) # Deletion updates the tenant count: the one default tenant remains - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 class Check(enum.Enum): @@ -469,7 +469,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. 
+ exactly one should execute: the rest should give 202 responses but not start + another deletion. This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -484,14 +485,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -507,7 +504,7 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") @@ -518,11 +515,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. - wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. 
@@ -532,15 +532,19 @@ def test_tenant_delete_concurrent( # The TenantSlot is still present while the original request is hung before # final removal - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + ) # Permit the original request to run to success ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) # Permit the duplicate background request to run to completion and fail. ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( @@ -554,7 +558,8 @@ def test_tenant_delete_concurrent( ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 def test_tenant_delete_races_timeline_creation( @@ -671,7 +676,7 @@ def test_tenant_delete_races_timeline_creation( ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d3f24cb06e..0ba0108651 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -132,7 +132,7 @@ def test_tenant_reattach(neon_env_builder: 
NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9def3ad1c2..68d9d9a660 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, @@ -24,7 +23,6 @@ from fixtures.remote_storage import ( from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -299,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 025cc930d7..e73eae91f0 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple @@ -11,18 +12,21 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + 
tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -35,66 +39,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. 
- # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. 
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -190,7 +153,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -233,7 +195,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -282,7 +243,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -335,33 +295,12 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. - - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. 
for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. - if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - elif pg_version == PgVersion.V15: - gc_horizon = 0x3BA00 - elif pg_version == PgVersion.V16: - gc_horizon = 210000 - else: - raise NotImplementedError(pg_version) - + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR tenant_config = { "compaction_period": "0s", "gc_period": "0s", - "pitr_interval": "0s", - "gc_horizon": gc_horizon, + "pitr_interval": "3600s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) @@ -375,18 +314,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -455,14 +382,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert 
phase, but before the update phase ends. - assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -482,8 +401,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -500,8 +418,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) @@ -512,20 +429,20 @@ def test_single_branch_get_tenant_size_grows( with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # Without setting a PITR interval, dropping the table doesn't reclaim any space - # from the user's point of view, because the DROP transaction is too small - # to fall out of gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) - prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size >= prev_size + prev_size = size - # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Set a zero PITR interval to allow the DROP to impact the synthetic size # Because synthetic size calculation uses pitr interval when available, # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately - tenant_config["pitr_interval"] = "1ms" + tenant_config["pitr_interval"] = "0s" env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id @@ -537,10 +454,6 @@ def test_single_branch_get_tenant_size_grows( # defined by gc_horizon. collected_responses.append(("DROP", current_lsn, size)) - # Should have gone past gc_horizon, otherwise gc_horizon is too large - bytes_written = current_lsn - initdb_lsn - assert bytes_written > gc_horizon - # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. @@ -706,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. 
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + # accept both, because the deletion might still complete before + matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)" + with pytest.raises(PageserverApiException, match=matcher): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the 
exact values, because they can be unstable # and cause flaky tests. So replace the values with useful invariants. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index f8701b65d7..2832304dcc 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -389,6 +389,9 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): if e.status_code == 409: log.info(f"delay_ms={delay_ms} 409") pass + elif e.status_code == 429: + log.info(f"delay_ms={delay_ms} 429") + pass elif e.status_code == 400: if "is less than existing" in e.message: # We send creation requests very close together in time: it is expected that these diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d16978d02a..a1e96928bf 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -246,7 +247,10 @@ def test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. 
@@ -276,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py new file mode 100644 index 0000000000..075f0a6bbc --- /dev/null +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -0,0 +1,402 @@ +import datetime +import enum +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue +from threading import Barrier +from typing import List + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo +from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage +from fixtures.types import Lsn, TimelineId + + +def by_end_lsn(info: HistoricLayerInfo) -> Lsn: + assert info.lsn_end is not None + return Lsn(info.lsn_end) + + +def layer_name(info: HistoricLayerInfo) -> str: + return info.layer_file_name + + +@enum.unique +class Branchpoint(str, enum.Enum): + """ + Have branches at these Lsns possibly relative to L0 layer boundary. 
+ """ + + EARLIER = "earlier" + AT_L0 = "at" + AFTER_L0 = "after" + LAST_RECORD_LSN = "head" + + def __str__(self) -> str: + return self.value + + @staticmethod + def all() -> List["Branchpoint"]: + return [ + Branchpoint.EARLIER, + Branchpoint.AT_L0, + Branchpoint.AFTER_L0, + Branchpoint.LAST_RECORD_LSN, + ] + + +SHUTDOWN_ALLOWED_ERRORS = [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", +] + + +@pytest.mark.parametrize("branchpoint", Branchpoint.all()) +@pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) +def test_ancestor_detach_branched_from( + neon_env_builder: NeonEnvBuilder, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, +): + """ + Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. + """ + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + + after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + # create a single layer for us to remote copy + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() + # there is also the in-mem layer, but ignore it for now + assert 
len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed" + later_delta = max(deltas, key=by_end_lsn) + assert later_delta.lsn_end is not None + + # -1 as the lsn_end is exclusive. + last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1 + + if branchpoint == Branchpoint.EARLIER: + branch_at = after_first_tx + rows = 0 + truncated_layers = 1 + elif branchpoint == Branchpoint.AT_L0: + branch_at = Lsn(last_lsn) + rows = 8192 + truncated_layers = 0 + elif branchpoint == Branchpoint.AFTER_L0: + branch_at = Lsn(last_lsn + 8) + rows = 8192 + # as there is no 8 byte walrecord, nothing should get copied from the straddling layer + truncated_layers = 0 + else: + # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet + assert branchpoint == Branchpoint.LAST_RECORD_LSN + branch_at = None + rows = 16384 + truncated_layers = 0 + + name = "new main" + + timeline_id = env.neon_cli.create_branch( + name, "main", env.initial_tenant, ancestor_start_lsn=branch_at + ) + + recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) + if branch_at is None: + # fix it up if we need it later (currently unused) + branch_at = recorded + else: + assert branch_at == recorded, "the test should not use unaligned lsns" + + if write_to_branch_first: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + # make sure the ep is writable + # with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + # branch must have a flush for "PREV_LSN: none" + client.timeline_checkpoint(env.initial_tenant, timeline_id) + branch_layers = set( + map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers) + ) + else: + branch_layers = set() + + 
all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == set() + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384 + + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + old_main = set(map(layer_name, old_main_info.historic_layers)) + + new_main_info = client.layer_map_info(env.initial_tenant, timeline_id) + new_main = set(map(layer_name, new_main_info.historic_layers)) + + new_main_copied_or_truncated = new_main - branch_layers + new_main_truncated = new_main_copied_or_truncated - old_main + + assert len(new_main_truncated) == truncated_layers + # could additionally check that the symmetric difference has layers starting at the same lsn + # but if nothing was copied, then there is no nice rule. + # there could be a hole in LSNs between copied from the "old main" and the first branch layer. + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): + """ + The case from RFC: + + +-> another branch with same ancestor_lsn as new main + | + old main -------|---------X---------> + | | | + | | +-> after + | | + | +-> new main + | + +-> reparented + + Ends up as: + + old main ---------------------------> + | + +-> after + + +-> another branch with same ancestor_lsn as new main + | + new main -------|---------|-> + | + +-> reparented + + We confirm the end result by being able to delete "old main" after deleting "after". 
+ """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + # as this only gets reparented, we don't need to write to it like new main + reparented = env.neon_cli.create_branch( + "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + ) + + same_branchpoint = env.neon_cli.create_branch( + "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == {reparented, same_branchpoint} + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("new main", timeline_id, None, 8192, 1), + ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), + ("reparented", reparented, timeline_id, 0, 1), + ] + + assert 
isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert TimelineId(ancestor_timeline_id) == expected_ancestor + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. 
+ when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the timelines to confirm detach actually worked + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): + """ + Makes sure that the timeline is able to receive writes through-out the detach process. + """ + + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + # row counts have been manually verified to cause reconnections and getpage + # requests when restart_after=False with pg16 + def insert_rows(n: int, ep) -> int: + ep.safe_psql( + f"INSERT INTO foo SELECT i::bigint, 'more info!! 
this is a long string' || i FROM generate_series(0, {n - 1}) g(i);" + ) + return n + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);") + + rows = insert_rows(256, ep) + + branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint + ) + + log.info("starting the new main endpoint") + ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + def small_txs(ep, queue: Queue[str], barrier): + extra_rows = 0 + + with ep.connect() as conn: + while True: + try: + queue.get_nowait() + break + except Empty: + pass + + if barrier is not None: + barrier.wait() + barrier = None + + cursor = conn.cursor() + cursor.execute( + "INSERT INTO foo(i, aux) VALUES (1, 'more info!! 
this is a long string' || 1);" + ) + extra_rows += 1 + return extra_rows + + with ThreadPoolExecutor(max_workers=1) as exec: + queue: Queue[str] = Queue() + barrier = Barrier(2) + + completion = exec.submit(small_txs, ep, queue, barrier) + barrier.wait() + + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert len(reparented) == 0 + + env.pageserver.quiesce_tenants() + + queue.put("done") + extra_rows = completion.result() + assert extra_rows > 0, "some rows should had been written" + rows += extra_rows + + assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None + + assert ep.safe_psql("SELECT clear_buffer_cache();") + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 + ep.stop() + + # finally restart the endpoint and make sure we still have the same answer + with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + +# TODO: +# - after starting the operation, tenant is deleted +# - after starting the operation, pageserver is shutdown, restarted +# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited +# - deletion of reparented while reparenting should fail once, then succeed (?) +# - branch near existing L1 boundary, image layers? +# - investigate: why are layers started at uneven lsn? not just after branching, but in general. 
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index eff103ca09..b549db1af6 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -168,13 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. # - # Ignore the LSN on the page though (first 8 bytes). If the dirty - # VM page is flushed from the cache for some reason, it gets WAL-logged, - # which changes the LSN on the page. + # Ignore page header (24 bytes) of visibility map. + # If the dirty VM page is flushed from the cache for some reason, + # it gets WAL-logged, which changes the LSN on the page. + # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() - cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") - vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) + vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() assert vm_page_at_pageserver == vm_page_in_cache diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2cac58dc1a..967d133e18 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -103,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their 
ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. # Neon CLI stores its branch <-> timeline mapping in its internals, @@ -1136,13 +1134,13 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline for f in mismatch: f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( @@ -1830,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1861,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -2009,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): ) log.info(f"dump_control_file response: {res}") assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables 
periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 720633189e..dce5616ac6 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -10,6 +10,7 @@ import pytest import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId log = 
getLogger("root.safekeeper_async") @@ -76,20 +77,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +100,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() @@ -199,7 +200,9 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530, temporary here + # TODO: remove afer partial backup is enabled by default + victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +216,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() 
env.neon_cli.create_branch("test_safekeepers_restarts_under_load") @@ -250,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -264,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -343,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: diff --git a/trace/src/main.rs b/trace/src/main.rs index 4605c124e9..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, 
PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,7 +58,7 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} @@ -89,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3b09894ddb..d6f7e2c604 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89 +Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 80cef885ad..f0d6b0ef75 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f +Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9007894722..8ef3c33aa0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b +Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 diff --git a/vendor/revisions.json b/vendor/revisions.json index ae524d70b1..c5b55762fa 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", - "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f", - "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89" + "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], + "v15": ["15.6", 
"f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], + "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 5b93088303..e9d983eba3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -16,10 +22,19 @@ commands: - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -88,6 +103,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. 
+ scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). + collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). 
+ collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -187,6 +237,113 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then pg_last_wal_replay_lsn() + else pg_current_wal_lsn() + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoints_req + type: gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: 
[lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools # @@ -255,17 +412,32 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
+ RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 152c452dd4..b605757f64 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,8 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = 
"0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -38,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -59,14 +57,16 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } +sync_wrapper = { version = "0.1", 
default-features = false, features = ["futures"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -77,7 +77,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } @@ -92,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] }