diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 0000000000..8bccd51c6d --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,2 @@ +[profile.default] +slow-timeout = "1m" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e2f15d96db..8bf12c31b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -23,6 +23,21 @@ on: type: boolean description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false + collect_olap_explain: + type: boolean + description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected' + required: false + default: false + collect_pg_stat_statements: + type: boolean + description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected' + required: false + default: false + run_AWS_RDS_AND_AURORA: + type: boolean + description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. 
Set this to true to run them on every workflow_dispatch' + required: false + default: false defaults: run: @@ -113,6 +128,8 @@ jobs: # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + env: + RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} runs-on: ubuntu-latest outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -152,7 +169,7 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, { "platform": "rds-aurora" }]') fi @@ -171,9 +188,9 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -337,6 +354,8 @@ jobs: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} @@ -399,6 +418,8 @@ jobs: env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ 
github.event.inputs.collect_pg_stat_statements || 'false' }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..e401b2f418 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,105 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + outputs: + build-tools-tag: + description: "tag generated for build tools" + value: ${{ jobs.tag.outputs.build-tools-tag }} + +jobs: + check-if-build-tools-dockerfile-changed: + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then + echo "docker_file_changed=false" >> $GITHUB_OUTPUT + exit + fi + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + tag: + runs-on: ubuntu-latest + needs: [ check-if-build-tools-dockerfile-changed ] + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + env: + DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + kaniko: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' 
+ needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + + kaniko-arm: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + manifest: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - tag + - kaniko + - kaniko-arm + - check-if-build-tools-dockerfile-changed + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ 
needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 820848b4fb..78deff6e85 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,7 +44,6 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -74,11 +73,19 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + build-buildtools-image: needs: [ check-permissions ] + uses: ./.github/workflows/build_and_push_docker_image.yml + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + secrets: inherit + + check-codestyle-python: + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -108,10 +115,10 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - needs: [ check-permissions ] + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +182,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -199,6 +206,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 @@ -328,16 +339,16 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run cargo test + - name: Run rust tests run: | - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 
'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -347,7 +358,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install rust binaries run: | @@ -404,10 +415,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, tag ] + needs: [ check-permissions, build-neon, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -443,10 +454,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -475,12 +486,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, 
regress-tests, coverage-report, benchmarks, build-buildtools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -522,11 +533,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -690,7 +700,7 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -729,6 +739,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -739,7 +750,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -774,6 +785,7 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -784,7 +796,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -832,6 +844,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} @@ -853,7 +866,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.19.0 + VM_BUILDER_VERSION: v0.21.0 steps: - name: Checkout @@ -1097,6 +1110,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 0d7db8dfbc..c6c2b7386a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -142,6 +142,10 @@ 
jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v4 @@ -214,7 +218,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 @@ -238,6 +242,20 @@ jobs: options: --init steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + - name: Checkout uses: actions/checkout@v4 with: diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow is used to update the tag of the build tools image in ECR. +# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image. 
+ +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo 
"prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index f1afdee599..3f4495c9e7 100644 --- a/.gitignore +++ 
b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon @@ -18,3 +19,6 @@ test_output/ *.o *.so *.Po + +# pgindent typedef lists +*.list diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to a buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup. Currently, adding the `pinned` tag is a manual operation. + +You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned \ + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. 
+``` \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 6546590f6c..93efbadd79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,6 +44,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -178,15 +184,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" dependencies = [ "concurrent-queue", - "event-listener", + "event-listener 2.5.3", "futures-core", ] [[package]] name = "async-compression" -version = "0.4.0" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" +checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5" dependencies = [ "flate2", "futures-core", @@ -199,11 +205,13 @@ dependencies = [ [[package]] name = "async-lock" -version = "2.8.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c" dependencies = [ - "event-listener", + "event-listener 4.0.0", + "event-listener-strategy", + "pin-project-lite", ] [[package]] @@ -225,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -236,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -686,9 +694,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.16.0" +version = 
"0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467" +checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" dependencies = [ "async-trait", "base64 0.21.1", @@ -696,8 +704,10 @@ dependencies = [ "dyn-clone", "futures", "getrandom 0.2.11", + "hmac", "http-types", "log", + "once_cell", "paste", "pin-project", "quick-xml", @@ -706,6 +716,7 @@ dependencies = [ "rustc_version", "serde", "serde_json", + "sha2", "time", "url", "uuid", @@ -713,9 +724,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.16.2" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9" +checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" dependencies = [ "async-lock", "async-trait", @@ -725,7 +736,6 @@ dependencies = [ "oauth2", "pin-project", "serde", - "serde_json", "time", "tz-rs", "url", @@ -734,21 +744,18 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97" +checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" dependencies = [ "RustyXML", + "async-lock", "async-trait", "azure_core", "bytes", - "futures", - "hmac", "log", "serde", "serde_derive", - "serde_json", - "sha2", "time", "url", "uuid", @@ -756,13 +763,14 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4" +checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" dependencies = [ "RustyXML", "azure_core", "azure_storage", + 
"azure_svc_blobstorage", "bytes", "futures", "log", @@ -774,6 +782,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "azure_svc_blobstorage" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backtrace" version = "0.3.67" @@ -857,7 +881,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.28", + "syn 2.0.32", "which", ] @@ -890,7 +914,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5" dependencies = [ "memchr", "once_cell", - "regex-automata", + "regex-automata 0.1.10", "serde", ] @@ -1071,7 +1095,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1137,6 +1161,7 @@ dependencies = [ "flate2", "futures", "hyper", + "nix 0.26.2", "notify", "num_cpus", "opentelemetry", @@ -1144,8 +1169,10 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rust-ini", "serde", "serde_json", + "signal-hook", "tar", "tokio", "tokio-postgres", @@ -1177,6 +1204,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.11", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.9" @@ -1221,16 +1268,19 @@ name = 
"control_plane" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "camino", "clap", "comfy-table", "compute_api", + "futures", "git-version", "hex", "hyper", "nix 0.26.2", "once_cell", "pageserver_api", + "pageserver_client", "postgres", "postgres_backend", "postgres_connection", @@ -1244,6 +1294,8 @@ dependencies = [ "tar", "thiserror", "tokio", + "tokio-postgres", + "tokio-util", "toml", "tracing", "url", @@ -1404,6 +1456,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1457,7 +1515,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1468,7 +1526,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1543,7 +1601,16 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", +] + +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", ] [[package]] @@ -1637,7 +1704,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1680,6 +1747,27 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae" +dependencies = [ + 
"concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.0", + "pin-project-lite", +] + [[package]] name = "fail" version = "0.5.1" @@ -1870,7 +1958,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2042,6 +2130,10 @@ name = "hashbrown" version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashlink" @@ -2052,6 +2144,20 @@ dependencies = [ "hashbrown 0.13.2", ] +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 0.21.1", + "byteorder", + "crossbeam-channel", + "flate2", + "nom", + "num-traits", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2433,13 +2539,14 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", - "pem 1.1.1", - "ring 0.16.20", + "js-sys", + "pem 3.0.3", + "ring 0.17.6", "serde", "serde_json", "simple_asn1", @@ -2533,7 +2640,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" dependencies = [ - "regex-automata", + 
"regex-automata 0.1.10", ] [[package]] @@ -2559,9 +2666,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "memoffset" @@ -2634,14 +2741,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.6" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -2852,7 +2959,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2974,6 +3081,16 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "ordered-multimap" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +dependencies = [ + "dlv-list", + "hashbrown 0.14.0", +] + [[package]] name = "os_info" version = "3.7.0" @@ -3002,6 +3119,28 @@ dependencies = [ "sha2", ] +[[package]] +name = "pagebench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "hdrhistogram", + "humantime", + "humantime-serde", + "pageserver", + "pageserver_api", + "pageserver_client", + "rand 0.8.5", + "serde", + "serde_json", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pagectl" version = "0.1.0" @@ -3054,6 +3193,7 @@ dependencies = [ "humantime-serde", "hyper", "itertools", + "md5", "metrics", "nix 
0.26.2", "num-traits", @@ -3090,6 +3230,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", @@ -3112,6 +3253,7 @@ dependencies = [ "enum-map", "hex", "postgres_ffi", + "rand 0.8.5", "serde", "serde_json", "serde_with", @@ -3122,6 +3264,27 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "postgres", + "reqwest", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" @@ -3213,18 +3376,19 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.1.1" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" dependencies = [ - "base64 0.13.1", + "base64 0.21.1", + "serde", ] [[package]] name = "pem" -version = "2.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ "base64 0.21.1", "serde", @@ -3281,7 +3445,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3488,7 +3652,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3643,7 +3807,8 @@ dependencies = [ "serde", "serde_json", "sha2", - 
"socket2 0.5.3", + "smol_str", + "socket2 0.5.5", "sync_wrapper", "task-local-extensions", "thiserror", @@ -3667,9 +3832,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "memchr", "serde", @@ -3809,13 +3974,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.8.2" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.2", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", ] [[package]] @@ -3827,6 +3993,17 @@ dependencies = [ "regex-syntax 0.6.29", ] +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + [[package]] name = "regex-syntax" version = "0.6.29" @@ -3835,9 +4012,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.7.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "relative-path" @@ -3863,6 +4040,7 @@ dependencies = [ "bytes", "camino", "camino-tempfile", + "futures", "futures-util", "http-types", "hyper", @@ -4082,10 +4260,20 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.28", + "syn 
2.0.32", "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4217,17 +4405,20 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", + "aws-smithy-async", "bincode", "bytes", "chrono", "clap", "crc32c", "either", + "futures", "futures-util", "hex", "histogram", "itertools", "pageserver", + "pageserver_api", "rand 0.8.5", "remote_storage", "reqwest", @@ -4260,6 +4451,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "fail", "fs2", "futures", "git-version", @@ -4290,6 +4482,7 @@ dependencies = [ "tokio-io-timeout", "tokio-postgres", "tokio-stream", + "tokio-util", "toml_edit", "tracing", "url", @@ -4334,12 +4527,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.6", + "untrusted 0.9.0", ] [[package]] @@ -4515,7 +4708,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4596,7 +4789,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4709,6 +4902,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +[[package]] +name = "smol_str" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c" +dependencies = [ + "serde", +] + [[package]] name = "socket2" version = "0.4.9" @@ -4721,9 +4923,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.3" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" dependencies = [ "libc", "windows-sys 0.48.0", @@ -4854,9 +5056,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.28" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -4986,7 +5188,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5029,6 +5231,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -5070,18 +5281,18 @@ dependencies = [ [[package]] name = "tokio" -version = "1.28.1" +version = "1.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" +checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" dependencies = [ - "autocfg", + "backtrace", "bytes", "libc", "mio", "num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2 0.4.9", + "socket2 0.5.5", "tokio-macros", 
"windows-sys 0.48.0", ] @@ -5098,13 +5309,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5135,7 +5346,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2 0.5.3", + "socket2 0.5.5", "tokio", "tokio-util", ] @@ -5204,13 +5415,16 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.8" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", + "futures-util", + "hashbrown 0.14.0", "pin-project-lite", "tokio", "tracing", @@ -5402,7 +5616,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5669,6 +5883,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "fail", "futures", "heapless", "hex", @@ -5688,6 +5903,7 @@ dependencies = [ "serde", "serde_assert", "serde_json", + "serde_path_to_error", "serde_with", "signal-hook", "strum", @@ -5846,7 +6062,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -5880,7 +6096,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6192,6 +6408,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", + "getrandom 
0.2.11", "hex", "hmac", "hyper", @@ -6203,10 +6420,12 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "once_cell", "prost", "rand 0.8.5", "regex", - "regex-syntax 0.7.2", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", "reqwest", "ring 0.16.20", "rustls", @@ -6216,7 +6435,7 @@ dependencies = [ "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.28", + "syn 2.0.32", "time", "time-macros", "tokio", @@ -6278,22 +6497,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74" +checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd" +checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -6304,30 +6523,28 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" [[package]] name = "zstd" -version = "0.12.4" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.6" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = 
"2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index cbcb25359d..5de636778a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,8 @@ members = [ "control_plane", "pageserver", "pageserver/ctl", + "pageserver/client", + "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", @@ -38,10 +40,10 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.16" -azure_identity = "0.16" -azure_storage = "0.16" -azure_storage_blobs = "0.16" +azure_core = "0.18" +azure_identity = "0.18" +azure_storage = "0.18" +azure_storage_blobs = "0.18" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -78,6 +80,7 @@ futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" +hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" @@ -90,7 +93,7 @@ hyper-tungstenite = "0.11" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" -jsonwebtoken = "8" +jsonwebtoken = "9" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" @@ -109,7 +112,7 @@ pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" -regex = "1.4" +regex = "1.10.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] } reqwest-middleware = "0.2.0" @@ -132,6 +135,7 @@ serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" smallvec = "1.11" +smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" @@ -148,7 +152,7 
@@ tokio-postgres-rustls = "0.10.0" tokio-rustls = "0.24" tokio-stream = "0.1" tokio-tar = "0.3" -tokio-util = { version = "0.7", features = ["io"] } +tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} @@ -181,6 +185,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } +pageserver_client = { path = "./pageserver/client" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..c2fcd8841e --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,166 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 25.1 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf 
/var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.4.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. \ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + 
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.74.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . 
"$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + cargo install cargo-nextest && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 8109091e76..14ba1b5b9a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG @@ -48,7 +48,29 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ + # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. + # In vanilla postgres this function is limited to Postgres role superuser. + # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, + # so we do it here. 
+ old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ + # the first loop is for pg_stat_statement extension version <= 1.6 + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ + fi; \ + done; \ + # the second loop is for pg_stat_statement extension versions >= 1.7, + # where pg_stat_statement_reset() got 3 additional arguments + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if ! echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ + fi; \ + done ######################################################################################### # @@ -393,7 +415,9 @@ RUN case "${PG_VERSION}" in \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ *) \ - echo "TimescaleDB not supported on this PostgreSQL version. 
See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \ + export TIMESCALEDB_VERSION=2.13.0 \ + export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ + ;; \ esac && \ apt-get update && \ apt-get install -y cmake && \ @@ -567,6 +591,23 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-semver-pg-build" +# compile pg_semver extension +# +######################################################################################### +FROM build-deps AS pg-semver-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ + echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ + mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control + ######################################################################################### # # Layer "pg-embedding-pg-build" @@ -729,8 +770,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control + make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # @@ -767,6 +807,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY pgxn/ pgxn/ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/Makefile b/Makefile index 89acbe564a..004ca3fbcf 100644 --- a/Makefile +++ b/Makefile @@ -260,6 +260,44 @@ distclean: fmt: ./pre-commit.py --fix-inplace +postgres-%-pg-bsd-indent: postgres-% + +@echo "Compiling pg_bsd_indent" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/ + +# Create typedef list for the core. Note that generally it should be combined with +# buildfarm one to cover platform specific stuff. +# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code +postgres-%-typedefs.list: postgres-% + $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@ + +# Indent postgres. See src/tools/pgindent/README for details. 
+.PHONY: postgres-%-pgindent +postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list + +@echo merge with buildfarm typedef to cover all platforms + +@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \ + REL_16_STABLE list misses PGSemaphoreData + # wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\ + # cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list + cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\ + cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list + +@echo note: you might want to run it on selected files/dirs instead. + INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \ + $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \ + $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \ + --excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns + rm -f pg*.BAK + +# Indent pxgn/neon. 
+.PHONY: pgindent +neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16 + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \ + INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \ + PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent + + .PHONY: setup-pre-commit-hook setup-pre-commit-hook: ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit diff --git a/README.md b/README.md index 3e3123f5ee..98af1edee6 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python-poetry lsof libicu-dev +libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel openssl poetry lsof libicu-devel + protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \ + libffi-devel ``` * On Arch based systems, these packages are needed: ```bash diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 47378f1910..759a117ee9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -13,6 +13,7 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } +nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true @@ -20,6 
+21,7 @@ postgres.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true +signal-hook.workspace = true tar.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } @@ -37,5 +39,6 @@ workspace_hack.workspace = true toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } -zstd = "0.12.4" +zstd = "0.13" bytes = "1.0" +rust-ini = "0.20.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 36e9ca0731..eb1d746f04 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,25 +31,31 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r http://pg-ext-s3-gateway +//! -r http://pg-ext-s3-gateway \ +//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' +//! --pgbouncer-ini-path /etc/pgbouncer.ini \ //! ``` //! 
use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use nix::sys::signal::{kill, Signal}; +use signal_hook::consts::{SIGQUIT, SIGTERM}; +use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -65,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; + thread::spawn(move || { + for sig in signals.forever() { + handle_exit_signal(sig); + } + }); + let build_tag = option_env!("BUILD_TAG") .unwrap_or(BUILD_TAG_DEFAULT) .to_string(); @@ -99,6 +112,9 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); + let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. 
@@ -209,6 +225,8 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, + pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), + pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); @@ -274,7 +292,13 @@ fn main() -> Result<()> { let mut state = compute.state.lock().unwrap(); state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; - drop(state); + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock delay_exit = true; None } @@ -333,6 +357,7 @@ fn main() -> Result<()> { let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() } @@ -487,6 +512,41 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("pgbouncer-connstr") + .long("pgbouncer-connstr") + .default_value( + "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", + ) + .value_name("PGBOUNCER_CONNSTR"), + ) + .arg( + Arg::new("pgbouncer-ini-path") + .long("pgbouncer-ini-path") + // Note: this doesn't match current path for pgbouncer.ini. + // Until we fix it, we need to pass the path explicitly + // or this will be effectively no-op. + .default_value("/etc/pgbouncer.ini") + .value_name("PGBOUNCER_INI_PATH"), + ) +} + +/// When compute_ctl is killed, send also termination signal to sync-safekeepers +/// to prevent leakage. 
TODO: it is better to convert compute_ctl to async and +/// wait for termination which would be easy then. +fn handle_exit_signal(sig: i32) { + info!("received {sig} termination signal"); + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + kill(pg_pid, Signal::SIGTERM).ok(); + } + exit(1); } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0dfacb615c..13701b7378 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -6,7 +6,10 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; +use std::thread; use std::time::Instant; use anyhow::{Context, Result}; @@ -22,7 +25,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; use remote_storage::{DownloadError, RemotePath}; @@ -33,6 +36,9 @@ use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; +pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); +pub static PG_PID: AtomicU32 = AtomicU32::new(0); + /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { // Url type maintains proper escaping @@ -64,6 +70,10 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? 
pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, + // connection string to pgbouncer to change settings + pub pgbouncer_connstr: Option, + // path to pgbouncer.ini to change settings + pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -252,7 +262,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> IF NOT EXISTS ( SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data; + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; IF array_length(roles, 1) IS NOT NULL THEN EXECUTE format('GRANT neon_superuser TO %s', array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); @@ -277,6 +287,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> } impl ComputeNode { + /// Check that compute node has corresponding feature enabled. + pub fn has_feature(&self, feature: ComputeFeature) -> bool { + let state = self.state.lock().unwrap(); + + if let Some(s) = state.pspec.as_ref() { + s.spec.features.contains(&feature) + } else { + false + } + } + pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); state.status = status; @@ -485,6 +506,7 @@ impl ComputeNode { .stdout(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); + SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and // final LSN to stdout. 
So we pipe only stdout, while stderr will be automatically @@ -492,6 +514,7 @@ impl ComputeNode { let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); if !sync_output.status.success() { anyhow::bail!( @@ -646,6 +669,7 @@ impl ComputeNode { }) .spawn() .expect("cannot start postgres process"); + PG_PID.store(pg.id(), Ordering::SeqCst); wait_for_postgres(&mut pg, pgdata_path)?; @@ -726,9 +750,39 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc... 
+ config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; self.pg_reload_conf()?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?; @@ -749,6 +803,10 @@ impl ComputeNode { // 'Close' connection drop(client); + // reset max_cluster_size in config back to original value and reload config + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + let unknown_op = "unknown".to_string(); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); info!( @@ -771,6 +829,32 @@ impl ComputeNode { pspec.timeline_id, ); + // tune pgbouncer + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions @@ -809,7 +893,17 @@ impl ComputeNode { let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... 
+ config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; + self.apply_config(&compute_state)?; + + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; } let startup_end_time = Utc::now(); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index bc48a2110d..a7ef8cea92 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -93,5 +93,25 @@ pub fn write_postgres_conf( writeln!(file, "neon.extension_server_port={}", port)?; } + // This is essential to keep this line at the end of the file, + // because it is intended to override any settings above. + writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; + + Ok(()) +} + +/// create file compute_ctl_temp_override.conf in pgdata_dir +/// add provided options to this file +pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> { + let path = pgdata_path.join("compute_ctl_temp_override.conf"); + let mut file = File::create(path)?; + write!(file, "{}", options)?; + Ok(()) +} + +/// remove file compute_ctl_temp_override.conf in pgdata_dir +pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> { + let path = pgdata_path.join("compute_ctl_temp_override.conf"); + std::fs::remove_file(path)?; Ok(()) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index ef6ca6eee3..fa2c4cff28 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -227,7 +227,7 @@ async fn handle_configure_request( let parsed_spec = match ParsedSpec::try_from(spec) { Ok(ps) => ps, - Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)), + Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)), }; // XXX: wrap state update under lock in code blocks. 
Otherwise, diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index dc26cc63eb..cedc6ece8f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -156,17 +156,17 @@ paths: description: Error text or 'OK' if download succeeded. example: "OK" 400: - description: Request is invalid. - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" + description: Request is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" 500: - description: Extension download request failed. - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" + description: Extension download request failed. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" components: securitySchemes: diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8722822f5e..0b0e137c03 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,9 +9,11 @@ use std::process::Child; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use tracing::{debug, instrument}; +use tokio_postgres::NoTls; +use tracing::{debug, error, info, instrument}; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; @@ -193,16 +195,11 @@ impl Escaping for PgIdent { /// Build a list of existing Postgres roles pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result> { let postgres_roles = xact - .query( - "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid", - &[], - )? + .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])? 
.iter() .map(|row| Role { name: row.get("rolname"), encrypted_password: row.get("rolpassword"), - replication: Some(row.get("rolreplication")), - bypassrls: Some(row.get("rolbypassrls")), options: None, }) .collect(); @@ -364,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { Ok(()) } + +/// Update pgbouncer.ini with provided options +pub fn update_pgbouncer_ini( + pgbouncer_config: HashMap, + pgbouncer_ini_path: &str, +) -> Result<()> { + let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; + let section = conf.section_mut(Some("pgbouncer")).unwrap(); + + for (option_name, value) in pgbouncer_config.iter() { + section.insert(option_name, value); + } + + conf.write_to_file(pgbouncer_ini_path)?; + Ok(()) +} + +/// Tune pgbouncer. +/// 1. Apply new config using pgbouncer admin console +/// 2. Add new values to pgbouncer.ini to preserve them after restart +pub async fn tune_pgbouncer( + pgbouncer_settings: Option>, + pgbouncer_connstr: &str, + pgbouncer_ini_path: Option, +) -> Result<()> { + if let Some(pgbouncer_config) = pgbouncer_settings { + // Apply new config + let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + for (option_name, value) in pgbouncer_config.iter() { + info!( + "Applying pgbouncer setting change: {} = {}", + option_name, value + ); + let query = format!("SET {} = {}", option_name, value); + + let result = client.simple_query(&query).await; + + info!("Applying pgbouncer setting change: {}", query); + info!("pgbouncer setting change result: {:?}", result); + + if let Err(err) = result { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err + ); + }; + } + + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + if let 
Some(pgbouncer_ini_path) = pgbouncer_ini_path { + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + } + } + + Ok(()) +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 8c44c6d519..d545858dc2 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane( spec } -/// It takes cluster specification and does the following: -/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. -/// - Update `pg_hba.conf` to allow external connections. -pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { - // File `postgresql.conf` is no longer included into `basebackup`, so just - // always write all config into it creating new file. - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?; - - update_pg_hba(pgdata_path)?; - - Ok(()) -} - /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json @@ -265,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let action = if let Some(r) = pg_role { if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) - || !r.bypassrls.unwrap_or(false) - || !r.replication.unwrap_or(false) { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { @@ -298,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { match action { RoleAction::None => {} RoleAction::Update => { - let mut query: String = - format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote()); + // This can be run on /every/ role! Not just ones created through the console. 
+ // This means that if you add some funny ALTER here that adds a permission, + // this will get run even on user-created roles! This will result in different + // behavior before and after a spec gets reapplied. The below ALTER as it stands + // now only grants LOGIN and changes the password. Please do not allow this branch + // to do anything silly. + let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } RoleAction::Create => { + // This branch only runs when roles are created through the console, so it is + // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited + // from neon_superuser. let mut query: String = format!( - "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); @@ -377,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli Ok(()) } +fn reassign_owned_objects_in_one_db( + conf: Config, + role_name: &PgIdent, + db_owner: &PgIdent, +) -> Result<()> { + let mut client = conf.connect(NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.pg_quote(), + db_owner.pg_quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, + conf.get_dbname().unwrap_or(""), + db_owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); + client.simple_query(&drop_query)?; + Ok(()) +} + // Reassign all owned objects in all databases to the owner of the database. 
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { for db in &spec.cluster.databases { if db.owner != *role_name { let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); - - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db.owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, &db.name, &db.owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; + reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; } } + // Also handle case when there are no databases in the spec. + // In this case we need to reassign objects in the default database. + let conf = Config::from_str(connstr)?; + let db_owner = PgIdent::from_str("cloud_admin")?; + reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; + Ok(()) } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7ccddc161e..898ad05add 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,9 +6,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true git-version.workspace = true nix.workspace = true once_cell.workspace = true @@ -24,10 +26,11 @@ tar.workspace = true thiserror.workspace = true toml.workspace = true tokio.workspace = true +tokio-postgres.workspace = true +tokio-util.workspace = true url.workspace = true -# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api -# instead, so that recompile times are better. 
pageserver_api.workspace = true +pageserver_client.workspace = true postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 822ac7d8a6..731c05809e 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -9,7 +9,7 @@ pub struct AttachmentService { env: LocalEnv, listen: String, path: PathBuf, - client: reqwest::blocking::Client, + client: reqwest::Client, } const COMMAND: &str = "attachment_service"; @@ -53,7 +53,7 @@ impl AttachmentService { env: env.clone(), path, listen, - client: reqwest::blocking::ClientBuilder::new() + client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), } @@ -64,7 +64,7 @@ impl AttachmentService { .expect("non-Unicode path") } - pub fn start(&self) -> anyhow::Result { + pub async fn start(&self) -> anyhow::Result { let path_str = self.path.to_string_lossy(); background_process::start_process( @@ -73,10 +73,11 @@ impl AttachmentService { &self.env.attachment_service_bin(), ["-l", &self.listen, "-p", &path_str], [], - background_process::InitialPidFile::Create(&self.pid_file()), + background_process::InitialPidFile::Create(self.pid_file()), // TODO: a real status check - || Ok(true), + || async move { anyhow::Ok(true) }, ) + .await } pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { @@ -84,7 +85,7 @@ impl AttachmentService { } /// Call into the attach_hook API, for use before handing out attachments to pageservers - pub fn attach_hook( + pub async fn attach_hook( &self, tenant_id: TenantId, pageserver_id: NodeId, @@ -104,16 +105,16 @@ impl AttachmentService { node_id: Some(pageserver_id), }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", 
response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.gen) } - pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { + pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { use hyper::StatusCode; let url = self @@ -126,12 +127,12 @@ impl AttachmentService { let request = InspectRequest { tenant_id }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.attachment) } } diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 26fc08fc8f..20fa3af9b8 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. -pub enum InitialPidFile<'t> { +pub enum InitialPidFile { /// Create a pidfile, to allow future CLI invocations to manipulate the process. - Create(&'t Utf8Path), + Create(Utf8PathBuf), /// The process will create the pidfile itself, need to wait for that event. - Expect(&'t Utf8Path), + Expect(Utf8PathBuf), } /// Start a background child process using the parameters given. 
-pub fn start_process( +pub async fn start_process( process_name: &str, datadir: &Path, command: &Path, @@ -62,7 +62,8 @@ pub fn start_process( process_status_check: F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, AI: IntoIterator, A: AsRef, // Not generic AsRef, otherwise empty `envs` prevents type inference @@ -89,7 +90,7 @@ where let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); - let pid_file_to_check = match initial_pid_file { + let pid_file_to_check = match &initial_pid_file { InitialPidFile::Create(path) => { pre_exec_create_pidfile(filled_cmd, path); path @@ -107,7 +108,7 @@ where ); for retries in 0..RETRIES { - match process_started(pid, Some(pid_file_to_check), &process_status_check) { + match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started, pid: {pid}"); return Ok(spawned_process); @@ -316,22 +317,20 @@ where cmd } -fn process_started( +async fn process_started( pid: Pid, - pid_file_to_check: Option<&Utf8Path>, + pid_file_to_check: &Utf8Path, status_check: &F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, { - match status_check() { - Ok(true) => match pid_file_to_check { - Some(pid_file_path) => match pid_file::read(pid_file_path)? { - PidFileRead::NotExist => Ok(false), - PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), - PidFileRead::NotHeldByAnyProcess(_) => Ok(false), - }, - None => Ok(true), + match status_check().await { + Ok(true) => match pid_file::read(pid_file_to_check)? 
{ + PidFileRead::NotExist => Ok(false), + PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), + PidFileRead::NotHeldByAnyProcess(_) => Ok(false), }, Ok(false) => Ok(false), Err(e) => anyhow::bail!("process failed to start: {e}"), diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index be7cff352c..e50c8fbba0 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr // TODO(sharding): make this shard-aware if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) { let valid = tenant_state.generation == req_tenant.gen; + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {})", + req_tenant.id, + req_tenant.gen, + tenant_state.generation + ); response.tenants.push(ValidateResponseTenant { id: req_tenant.id, valid, @@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap tenant_state.pageserver = attach_req.node_id; let generation = tenant_state.generation; + tracing::info!( + "handle_attach_hook: tenant {} set generation {}, pageserver {}", + attach_req.tenant_id, + tenant_state.generation, + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + ); + locked.save().await.map_err(ApiError::InternalServerError)?; json_response( diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f7442c02c7..03e69010f7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -120,15 +120,20 @@ fn main() -> Result<()> { let mut env = LocalEnv::load_config().context("Error loading config")?; let original_env = env.clone(); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let subcommand_result = match sub_name { - "tenant" => handle_tenant(sub_args, &mut env), - "timeline" => handle_timeline(sub_args, 
&mut env), - "start" => handle_start_all(sub_args, &env), + "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), + "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), + "start" => rt.block_on(handle_start_all(sub_args, &env)), "stop" => handle_stop_all(sub_args, &env), - "pageserver" => handle_pageserver(sub_args, &env), - "attachment_service" => handle_attachment_service(sub_args, &env), - "safekeeper" => handle_safekeeper(sub_args, &env), - "endpoint" => handle_endpoint(sub_args, &env), + "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), + "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), + "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), @@ -168,7 +173,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)), }, ) }) @@ -269,12 +274,13 @@ fn print_timeline( /// Returns a map of timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. -fn get_timeline_infos( +async fn get_timeline_infos( env: &local_env::LocalEnv, tenant_id: &TenantId, ) -> Result> { Ok(get_default_pageserver(env) - .timeline_list(tenant_id)? + .timeline_list(tenant_id) + .await? 
.into_iter() .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) .collect()) @@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_tenant( + tenant_match: &ArgMatches, + env: &mut local_env::LocalEnv, +) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); match tenant_match.subcommand() { Some(("list", _)) => { - for t in pageserver.tenant_list()? { + for t in pageserver.tenant_list().await? { println!("{} {:?}", t.id, t.state); } } @@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // We must register the tenant with the attachment service, so // that when the pageserver restarts, it will be re-attached. let attachment_service = AttachmentService::from_env(env); - attachment_service.attach_hook(tenant_id, pageserver.conf.id)? + attachment_service + .attach_hook(tenant_id, pageserver.conf.id) + .await? 
} else { None }; - pageserver.tenant_create(tenant_id, generation, tenant_conf)?; + pageserver + .tenant_create(tenant_id, generation, tenant_conf) + .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant @@ -409,13 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .copied() .context("Failed to parse postgres version from the argument string")?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -449,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an pageserver .tenant_config(tenant_id, tenant_conf) + .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } @@ -457,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an let new_pageserver = get_pageserver(env, matches)?; let new_pageserver_id = new_pageserver.conf.id; - migrate_tenant(env, tenant_id, new_pageserver)?; + migrate_tenant(env, tenant_id, new_pageserver).await?; println!("tenant {tenant_id} migrated to {}", new_pageserver_id); } @@ -467,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Ok(()) } -fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = get_default_pageserver(env); match timeline_match.subcommand() { Some(("list", list_match)) => { let tenant_id = get_tenant_id(list_match, 
env)?; - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } Some(("create", create_match)) => { @@ -489,13 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_timeline_id_opt = parse_timeline_id(create_match)?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id_opt, - None, - None, - Some(pg_version), - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id_opt, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -540,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + pageserver + .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) + .await?; env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; println!("Creating endpoint for imported timeline ..."); @@ -576,13 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline_info = pageserver.timeline_create( - tenant_id, - None, - start_lsn, - Some(ancestor_timeline_id), - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + None, + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -601,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Ok(()) } -fn 
handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match ep_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, None => bail!("no endpoint subcommand provided"), @@ -611,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( match sub_name { "list" => { let tenant_id = get_tenant_id(sub_args, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); + let timeline_infos = get_timeline_infos(env, &tenant_id) + .await + .unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); + HashMap::new() + }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -788,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( }; println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token, safekeepers, remote_ext_config)?; + endpoint + .start(&auth_token, safekeepers, remote_ext_config) + .await?; } "reconfigure" => { let endpoint_id = sub_args @@ -806,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( } else { None }; - endpoint.reconfigure(pageserver_id)?; + endpoint.reconfigure(pageserver_id).await?; } "stop" => { let endpoint_id = sub_args @@ -872,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { +async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { if let Err(e) = get_pageserver(env, subcommand_args)? 
.start(&pageserver_config_overrides(subcommand_args)) + .await { eprintln!("pageserver start failed: {e}"); exit(1); @@ -903,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -917,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } } Some(("status", subcommand_args)) => { - match get_pageserver(env, subcommand_args)?.check_status() { + match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {}", err); @@ -939,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_attachment_service( + sub_match: &ArgMatches, + env: &local_env::LocalEnv, +) -> Result<()> { let svc = AttachmentService::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { - if let Err(e) = svc.start() { + if let Err(e) = svc.start().await { eprintln!("start failed: {e}"); exit(1); } @@ -984,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec { .collect() } -fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { 
Some(safekeeper_command_data) => safekeeper_command_data, None => bail!("no safekeeper subcommand provided"), @@ -1002,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1028,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1041,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env)?; + broker::start_broker_process(env).await?; // Only start the attachment service if the pageserver is configured to need it if env.control_plane_api.is_some() { let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start() { + if let Err(e) = attachment_service.start().await { eprintln!("attachment_service start failed: {:#}", e); try_stop_all(env, true); exit(1); @@ -1058,7 +1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(sub_match)) + .await + { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, 
true); exit(1); @@ -1067,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]) { + if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); try_stop_all(env, false); exit(1); diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6be865cc2e..f40705863b 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -11,7 +11,7 @@ use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let args = [format!("--listen-addr={listen_addr}")]; - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); background_process::start_process( "storage_broker", &env.base_data_dir, &env.storage_broker_bin(), args, [], - background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), - || { + background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { format!("Failed to append /status path to broker endpoint {url}") @@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { .get(status_url) .build() .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?; - match client.execute(request) { + match client.execute(request).await { Ok(resp) => Ok(resp.status().is_success()), Err(_) => Ok(false), } }, ) + .await .context("Failed to spawn storage_broker 
subprocess")?; Ok(()) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 12b1250764..3d5dfd6311 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::RemoteExtSpec; +use nix::sys::signal::kill; +use nix::sys::signal::Signal; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -439,11 +441,14 @@ impl Endpoint { Ok(()) } - fn wait_for_compute_ctl_to_exit(&self) -> Result<()> { + fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { // TODO use background_process::stop_process instead let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); + if send_sigterm { + kill(pid, Signal::SIGTERM).ok(); + } crate::background_process::wait_until_stopped("compute_ctl", pid)?; Ok(()) } @@ -464,7 +469,7 @@ impl Endpoint { } } - pub fn start( + pub async fn start( &self, auth_token: &Option, safekeepers: Vec, @@ -519,6 +524,7 @@ impl Endpoint { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, + features: vec![], cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used @@ -536,6 +542,7 @@ impl Endpoint { safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, + pgbouncer_settings: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -586,7 +593,7 @@ impl Endpoint { const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s loop { attempt += 1; - match self.get_status() { + match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { @@ -628,8 +635,8 @@ impl Endpoint { } // Call the /status HTTP API - pub fn 
get_status(&self) -> Result { - let client = reqwest::blocking::Client::new(); + pub async fn get_status(&self) -> Result { + let client = reqwest::Client::new(); let response = client .request( @@ -640,16 +647,17 @@ impl Endpoint { self.http_address.port() ), ) - .send()?; + .send() + .await?; // Interpret the response let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { - Ok(response.json()?) + Ok(response.json().await?) } else { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; @@ -657,7 +665,7 @@ impl Endpoint { } } - pub fn reconfigure(&self, pageserver_id: Option) -> Result<()> { + pub async fn reconfigure(&self, pageserver_id: Option) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -686,7 +694,7 @@ impl Endpoint { spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}")); } - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); let response = client .post(format!( "http://{}:{}/configure", @@ -697,14 +705,15 @@ impl Endpoint { "{{\"spec\":{}}}", serde_json::to_string_pretty(&spec)? )) - .send()?; + .send() + .await?; let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(()) } else { let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; @@ -729,10 +738,15 @@ impl Endpoint { &None, )?; - // Also wait for the compute_ctl process to die. 
It might have some cleanup - // work to do after postgres stops, like syncing safekeepers, etc. + // Also wait for the compute_ctl process to die. It might have some + // cleanup work to do after postgres stops, like syncing safekeepers, + // etc. // - self.wait_for_compute_ctl_to_exit()?; + // If destroying, send it SIGTERM before waiting. Sometimes we do *not* + // want this cleanup: tests intentionally call stop while a majority of + // safekeepers is down, so without SIGTERM sync-safekeepers would hang. + // This could be a separate flag though. + self.wait_for_compute_ctl_to_exit(destroy)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 0a55c90e8f..7d490016bf 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -6,28 +6,24 @@ //! use std::borrow::Cow; use std::collections::HashMap; -use std::fs::File; -use std::io::{BufReader, Write}; + +use std::io; +use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::{Child, Command}; use std::time::Duration; -use std::{io, result}; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{ - self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo, -}; +use futures::SinkExt; +use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; -use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method}; -use thiserror::Error; use utils::auth::{Claims, Scope}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; -#[derive(Error, Debug)] -pub enum PageserverHttpError { - #[error("Reqwest error: {0}")] - Transport(#[from] reqwest::Error), - - #[error("Error: {0}")] - Response(String), -} - -impl From for PageserverHttpError { - fn from(e: anyhow::Error) -> Self { - Self::Response(e.to_string()) - } -} - -type Result = result::Result; - -pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; -} - -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { - let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { - return Ok(self); - } - - // reqwest does not export its error construction utility functions, so let's craft the message ourselves - let url = self.url().to_owned(); - Err(PageserverHttpError::Response( - match self.json::() { - Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), - }, - )) - } -} - // // Control routines for pageserver. 
// @@ -87,8 +44,7 @@ pub struct PageServerNode { pub pg_connection_config: PgConnectionConfig, pub conf: PageServerConf, pub env: LocalEnv, - pub http_client: Client, - pub http_base_url: String, + pub http_client: mgmt_api::Client, } impl PageServerNode { @@ -100,8 +56,19 @@ impl PageServerNode { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), - http_client: Client::new(), - http_base_url: format!("http://{}/v1", conf.listen_http_addr), + http_client: mgmt_api::Client::new( + format!("http://{}", conf.listen_http_addr), + { + match conf.http_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some( + env.generate_auth_token(&Claims::new(None, Scope::PageServerApi)) + .unwrap(), + ), + } + } + .as_deref(), + ), } } @@ -182,8 +149,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, false) + pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result { + self.start_node(config_overrides, false).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -224,7 +191,12 @@ impl PageServerNode { Ok(()) } - fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result { + async fn start_node( + &self, + config_overrides: &[&str], + update_config: bool, + ) -> anyhow::Result { + // start_process() is async, so the status-check closure passed to it awaits check_status() directly. let datadir = self.repo_path(); print!( "Starting pageserver node {} at '{}' in {:?}", @@ -232,7 +204,7 @@ impl PageServerNode { self.pg_connection_config.raw_address(), datadir ); - io::stdout().flush()?; + io::stdout().flush().context("flush stdout")?; let datadir_path_str = datadir.to_str().with_context(|| { format!( @@ -244,20 +216,23 @@ impl PageServerNode { if update_config { args.push(Cow::Borrowed("--update-config")); } -
background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), args.iter().map(Cow::as_ref), self.pageserver_env_variables()?, - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(PageserverHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + let st = self.check_status().await; + match st { + Ok(()) => Ok(true), + Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } fn pageserver_basic_args<'a>( @@ -303,7 +278,12 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql_client(&self) -> anyhow::Result { + pub async fn page_server_psql_client( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { let mut config = self.pg_connection_config.clone(); if self.conf.pg_auth_type == AuthType::NeonJWT { let token = self @@ -311,36 +291,18 @@ impl PageServerNode { .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; config = config.set_password(Some(token)); } - Ok(config.connect_no_tls()?) + Ok(config.connect_no_tls().await?) } - fn http_request(&self, method: Method, url: U) -> anyhow::Result { - let mut builder = self.http_client.request(method, url); - if self.conf.http_auth_type == AuthType::NeonJWT { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - builder = builder.bearer_auth(token) - } - Ok(builder) + pub async fn check_status(&self) -> mgmt_api::Result<()> { + self.http_client.status().await } - pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/status", self.http_base_url))? - .send()? 
- .error_from_body()?; - Ok(()) + pub async fn tenant_list(&self) -> mgmt_api::Result> { + self.http_client.list_tenants().await } - pub fn tenant_list(&self) -> Result> { - Ok(self - .http_request(Method::GET, format!("{}/tenant", self.http_base_url))? - .send()? - .error_from_body()? - .json()?) - } - - pub fn tenant_create( + pub async fn tenant_create( &self, new_tenant_id: TenantId, generation: Option, @@ -407,6 +369,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_feedback' as bool")?, + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), }; let request = models::TenantCreateRequest { @@ -417,23 +380,10 @@ impl PageServerNode { if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))? - .json(&request) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") - })? - .context("No tenant id was found in the tenant creation response") - .and_then(|tenant_id_string| { - tenant_id_string.parse().with_context(|| { - format!("Failed to parse response string as tenant id: '{tenant_id_string}'") - }) - }) + Ok(self.http_client.tenant_create(&request).await?) } - pub fn tenant_config( + pub async fn tenant_config( &self, tenant_id: TenantId, mut settings: HashMap<&str, &str>, @@ -504,6 +454,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_feedback' as bool")?, + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), } }; @@ -511,85 +462,48 @@ impl PageServerNode { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? - .json(&models::TenantConfigRequest { tenant_id, config }) - .send()? 
- .error_from_body()?; + self.http_client + .tenant_config(&models::TenantConfigRequest { tenant_id, config }) + .await?; Ok(()) } - pub fn location_config( + pub async fn location_config( &self, tenant_id: TenantId, config: LocationConfig, flush_ms: Option, ) -> anyhow::Result<()> { - let req_body = TenantLocationConfigRequest { tenant_id, config }; - - let path = format!( - "{}/tenant/{}/location_config", - self.http_base_url, tenant_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - - self.http_request(Method::PUT, path)? - .json(&req_body) - .send()? - .error_from_body()?; - - Ok(()) + Ok(self + .http_client + .location_config(tenant_id, config, flush_ms) + .await?) } - pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { - let timeline_infos: Vec = self - .http_request( - Method::GET, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .send()? - .error_from_body()? - .json()?; - - Ok(timeline_infos) + pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { + Ok(self.http_client.list_timelines(*tenant_id).await?) } - pub fn timeline_create( + pub async fn timeline_create( &self, tenant_id: TenantId, new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, pg_version: Option, + existing_initdb_timeline_id: Option, ) -> anyhow::Result { // If timeline ID was not specified, generate one let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); - - self.http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .json(&models::TimelineCreateRequest { + let req = models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, pg_version, - }) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse timeline creation response for tenant id: {tenant_id}") - })? 
- .with_context(|| { - format!( - "No timeline id was found in the timeline creation response for tenant {tenant_id}" - ) - }) + existing_initdb_timeline_id, + }; + Ok(self.http_client.timeline_create(tenant_id, &req).await?) } /// Import a basebackup prepared using either: @@ -601,7 +515,7 @@ impl PageServerNode { /// * `timeline_id` - id to assign to imported timeline /// * `base` - (start lsn of basebackup, path to `base.tar` file) /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) - pub fn timeline_import( + pub async fn timeline_import( &self, tenant_id: TenantId, timeline_id: TimelineId, @@ -609,36 +523,60 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.page_server_psql_client()?; + let (client, conn) = self.page_server_psql_client().await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = conn.await { + eprintln!("connection error: {}", e); + } + }); + tokio::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; - let base_tarfile = File::open(base_tarfile_path)?; - let mut base_reader = BufReader::new(base_tarfile); + let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; + let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { - let wal_tarfile = File::open(wal_tarfile_path)?; - let wal_reader = BufReader::new(wal_tarfile); + let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; + let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - // Import base - let import_cmd = format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ); - let mut writer = 
client.copy_in(&import_cmd)?; - io::copy(&mut base_reader, &mut writer)?; - writer.finish()?; + let copy_in = |reader, cmd| { + let client = &client; + async move { + let writer = client.copy_in(&cmd).await?; + let writer = std::pin::pin!(writer); + let mut writer = writer.sink_map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) + }); + let mut reader = std::pin::pin!(reader); + writer.send_all(&mut reader).await?; + writer.into_inner().finish().await?; + anyhow::Ok(()) + } + }; + // Import base + copy_in( + base_tarfile, + format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ), + ) + .await?; // Import wal if necessary - if let Some(mut wal_reader) = wal_reader { - let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); - let mut writer = client.copy_in(&import_cmd)?; - io::copy(&mut wal_reader, &mut writer)?; - writer.finish()?; + if let Some(wal_reader) = wal_reader { + copy_in( + wal_reader, + format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + ) + .await?; } Ok(()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a8baa0ac53..4026ef0eb9 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -13,7 +13,6 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; -use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; @@ -34,12 +33,14 @@ pub enum SafekeeperHttpError { type Result = result::Result; +#[async_trait::async_trait] pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; + async fn error_from_body(self) -> Result; } -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { 
+ async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); @@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( - match self.json::() { + match self.json::().await { Ok(err_body) => format!("Error: {}", err_body.msg), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }, @@ -69,7 +70,7 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, - pub http_client: Client, + pub http_client: reqwest::Client, pub http_base_url: String, } @@ -80,7 +81,7 @@ impl SafekeeperNode { conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), - http_client: Client::new(), + http_client: reqwest::Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), } } @@ -103,7 +104,7 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub fn start(&self, extra_opts: Vec) -> anyhow::Result { + pub async fn start(&self, extra_opts: Vec) -> anyhow::Result { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -191,13 +192,16 @@ impl SafekeeperNode { &self.env.safekeeper_bin(), &args, [], - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(SafekeeperHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + match self.check_status().await { + Ok(()) => Ok(true), + Err(SafekeeperHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } /// @@ -216,7 +220,7 @@ impl SafekeeperNode { ) } - fn 
http_request(&self, method: Method, url: U) -> RequestBuilder { + fn http_request(&self, method: Method, url: U) -> reqwest::RequestBuilder { // TODO: authentication //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) @@ -224,10 +228,12 @@ impl SafekeeperNode { self.http_client.request(method, url) } - pub fn check_status(&self) -> Result<()> { + pub async fn check_status(&self) -> Result<()> { self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) - .send()? - .error_from_body()?; + .send() + .await? + .error_from_body() + .await?; Ok(()) } } diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index c0c44e279f..79df108896 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -19,11 +19,11 @@ use utils::{ }; /// Given an attached pageserver, retrieve the LSN for all timelines -fn get_lsns( +async fn get_lsns( tenant_id: TenantId, pageserver: &PageServerNode, ) -> anyhow::Result> { - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; Ok(timelines .into_iter() .map(|t| (t.timeline_id, t.last_record_lsn)) @@ -32,13 +32,13 @@ fn get_lsns( /// Wait for the timeline LSNs on `pageserver` to catch up with or overtake /// `baseline`. 
-fn await_lsn( +async fn await_lsn( tenant_id: TenantId, pageserver: &PageServerNode, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match get_lsns(tenant_id, pageserver) { + let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, Err(e) => { println!( @@ -84,7 +84,7 @@ fn await_lsn( /// - Coordinate attach/secondary/detach on pageservers /// - call into attachment_service for generations /// - reconfigure compute endpoints to point to new attached pageserver -pub fn migrate_tenant( +pub async fn migrate_tenant( env: &LocalEnv, tenant_id: TenantId, dest_ps: PageServerNode, @@ -108,16 +108,18 @@ pub fn migrate_tenant( } } - let previous = attachment_service.inspect(tenant_id)?; + let previous = attachment_service.inspect(tenant_id).await?; let mut baseline_lsns = None; if let Some((generation, origin_ps_id)) = &previous { let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?); if origin_ps_id == &dest_ps.conf.id { println!("🔁 Already attached to {origin_ps_id}, freshening..."); - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); return Ok(()); } @@ -126,20 +128,24 @@ pub fn migrate_tenant( let stale_conf = build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None); - origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?; + origin_ps + .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10))) + .await?; - baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?); + baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let 
gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None); println!("🔁 Attaching to pageserver {}", dest_ps.conf.id); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; if let Some(baseline) = baseline_lsns { println!("🕑 Waiting for LSN to catch up..."); - await_lsn(tenant_id, &dest_ps, baseline)?; + await_lsn(tenant_id, &dest_ps, baseline).await?; } let cplane = ComputeControlPlane::load(env.clone())?; @@ -149,7 +155,7 @@ pub fn migrate_tenant( "🔁 Reconfiguring endpoint {} to use pageserver {}", endpoint_name, dest_ps.conf.id ); - endpoint.reconfigure(Some(dest_ps.conf.id))?; + endpoint.reconfigure(Some(dest_ps.conf.id)).await?; } } @@ -159,13 +165,13 @@ pub fn migrate_tenant( } let other_ps = PageServerNode::from_env(env, other_ps_conf); - let other_ps_tenants = other_ps.tenant_list()?; + let other_ps_tenants = other_ps.tenant_list().await?; // Check if this tenant is attached let found = other_ps_tenants .into_iter() .map(|t| t.id) - .any(|i| i == tenant_id); + .any(|i| i.tenant_id == tenant_id); if !found { continue; } @@ -181,7 +187,9 @@ pub fn migrate_tenant( "💤 Switching to secondary mode on pageserver {}", other_ps.conf.id ); - other_ps.location_config(tenant_id, secondary_conf, None)?; + other_ps + .location_config(tenant_id, secondary_conf, None) + .await?; } println!( @@ -189,7 +197,7 @@ pub fn migrate_tenant( dest_ps.conf.id ); let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); diff --git a/deny.toml b/deny.toml index 079dcac679..22e39a2ca3 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ allow = [ "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", + "CC0-1.0", "ISC", "MIT", "MPL-2.0", diff --git 
a/docs/rfcs/029-getpage-throttling.md b/docs/rfcs/029-getpage-throttling.md new file mode 100644 index 0000000000..b4f9adefc5 --- /dev/null +++ b/docs/rfcs/029-getpage-throttling.md @@ -0,0 +1,197 @@ +# Per-Tenant GetPage@LSN Throttling + +Author: Christian Schwarz +Date: Oct 24, 2023 + +## Summary + +This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver +and the interactions with its client, i.e., the neon_smgr component in Compute. + +The result of implementing & executing this RFC will be a fleet-wide upper limit for +**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**. + +## Background + +### GetPage@LSN Request Flow + +Pageserver exposes its `page_service.rs` as a libpq listener. +The Computes' `neon_smgr` module connects to that libpq listener. +Once a connection is established, the protocol allows Compute to request page images at a given LSN. +We call these requests GetPage@LSN requests, or GetPage requests for short. +Other request types can be sent, but these are low traffic compared to GetPage requests +and are not the concern of this RFC. + +Pageserver associates one libpq connection with one tokio task. + +Per connection/task, the pq protocol is handled by the common `postgres_backend` crate. +Its `run_message_loop` function invokes the `page_service` specific `impl postgres_backend::Handler for PageServerHandler`. +Requests are processed in the order in which they arrive via the TCP-based pq protocol. +So, there is no concurrent request processing within one connection/task. + +There is a degree of natural pipelining: +Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream. +And Pageserver can fill the pipe with responses in the other direction. +Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc. 
+ +### GetPage@LSN Access Pattern + +The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC). +Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches. + +If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*. + +## Motivation + +In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h, +then dropping to ca 18k GetPage/second for a period of 9h. + +We noticed this because of an internal GetPage latency SLO burn rate alert, i.e., +the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO. + +Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same Pageserver. + +However, here are some illustrative data points for the 155k period: +The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance. +We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`) +The CPU utilization of the instance was 75% user+system. +Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%. +The hit rate for materialized pages was ca. 40%. +Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100. + +The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**. +The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM. +The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**. 
+ +My claim is that it was **unhealthy to serve this workload at the pace we did**: +* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this) +* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons: + * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops. + At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit. + The result is an **uneven** performance profile from the Compute perspective. + + * **economics**: Neon currently does not charge for IOPS, only capacity. + **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.** + For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume. + Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume. + We charge 0$. + It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free. + + +Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits: +vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver. +So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity. + +## Solution: Throttling GetPage Requests + +**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**. +That is, unless we want to start charging for provisioned GetPage@LSN/second. +Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size. +Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913). 
+ +## The Design Space + +What remains is the question about *policy* and *mechanism*: + +**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant. +Candidates are: + +* hard limit, same limit value per connection|timeline|tenant + * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance. + This is a major operational pain point / risk right now. +* hard limit, configurable per connection|timeline|tenant + * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers. + * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant. +* fair share among active connections|timelines|tenants per instance + * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity + * NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance +* ... + + +Regarding **mechanism**, it's clear that **backpressure** is the way to go. +However, we must choose between +* **implicit** backpressure through pq/TCP and +* **explicit** rejection of requests + retries with exponential backoff + +Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**: +where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling? + +And when we eventually move the measurement point into the Computes (to avoid coordinated omission), +how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO? + +## Scope Of This RFC + +**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**. 
+ +This proposal is easy to implement and significantly de-risks operating large Pageservers, +based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants. + +For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate. +Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate. +If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation. + +The mechanism for backpressure will be TCP-based implicit backpressure. +The compute team isn't concerned about prefetch queue depth. +Pageserver will implement it by delaying the reading of requests from the libpq connection(s). + +The rate limit will be implemented using a per-tenant token bucket. +The bucket will be shared among all connections to the tenant. +The bucket implementation supports starvation-preventing `await`ing. +The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/). +The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771 +can be used to evaluate the overhead of sharing the bucket among connections of a tenant. +A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler. + +Regarding metrics / the internal GetPage latency SLO: +we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by: +- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver) +- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver) + +Further observability measures: +- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute. 
+ The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats. + +Rollout will happen as follows: +- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf +- experimentation in staging and later production to study impact & interaction with auto-scaling +- determination of a sensible global default value + - the value will be chosen as high as possible ... + - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance. +- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default +- reset of the experimental per-tenant overrides +- gain experience & lower the limit over time + - we stop lowering the limit as soon as this RFC's goal is achieved, i.e., + once we decide that in practice the chosen value sufficiently de-risks operating large pageservers + +The per-tenant override will remain for emergencies and testing. +But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant. + +Toward the upper layers of the Neon stack, the resulting limit will be +**"the highest GetPage/second that Pageserver can support for a single tenant"**. + +### Rationale + +We decided against error + retry because of worries about starvation. + +## Future Work + +Enable per-tenant emergency override of the limit via Console. +Should be part of a more general framework to specify tenant config overrides. +**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users, +or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that +concerns itself with GetPage/second capacity planning. + +Compute-side metrics for GetPage latency. + +Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled. 
+ +Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss. + +Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant. +Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance. +With per-tenant rate limiting, we will not meet that expectation. +However, we can currently only scale per tenant. +Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis. +But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit. +To solve this properly, I think we'll need replicas for tenants / shard. +To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas. diff --git a/docs/rfcs/027-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md similarity index 100% rename from docs/rfcs/027-pageserver-wal-disaster-recovery.md rename to docs/rfcs/029-pageserver-wal-disaster-recovery.md diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 175b4461ac..4ff6831272 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -26,6 +26,13 @@ pub struct ComputeSpec { // but we don't use it for anything. Serde will ignore missing fields when // deserializing it. pub operation_uuid: Option, + + /// Compute features to enable. These feature flags are provided, when we + /// know all the details about client's compute, so they cannot be used + /// to change `Empty` compute behavior. + #[serde(default)] + pub features: Vec, + /// Expected cluster state at the end of transition process. 
pub cluster: Cluster, pub delta_operations: Option>, @@ -66,6 +73,21 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, + + pub pgbouncer_settings: Option>, +} + +/// Feature flag to signal `compute_ctl` to enable certain experimental functionality. +#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeFeature { + // XXX: Add more feature flags here. + + // This is a special feature flag that is used to represent unknown feature flags. + // Basically all unknown to enum flags are represented as this one. See unit test + // `parse_unknown_features()` for more details. + #[serde(other)] + UnknownFeature, } #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -187,8 +209,6 @@ pub struct DeltaOp { pub struct Role { pub name: PgIdent, pub encrypted_password: Option, - pub replication: Option, - pub bypassrls: Option, pub options: GenericOptions, } @@ -229,7 +249,10 @@ mod tests { #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); - let _spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + + // Features list defaults to empty vector. + assert!(spec.features.is_empty()); } #[test] @@ -241,4 +264,22 @@ mod tests { ob.insert("unknown_field_123123123".into(), "hello".into()); let _spec: ComputeSpec = serde_json::from_value(json).unwrap(); } + + #[test] + fn parse_unknown_features() { + // Test that unknown feature flags do not cause any errors. + let file = File::open("tests/cluster_spec.json").unwrap(); + let mut json: serde_json::Value = serde_json::from_reader(file).unwrap(); + let ob = json.as_object_mut().unwrap(); + + // Add unknown feature flags. 
+ let features = vec!["foo_bar_feature", "baz_feature"]; + ob.insert("features".into(), features.into()); + + let spec: ComputeSpec = serde_json::from_value(json).unwrap(); + + assert!(spec.features.len() == 2); + assert!(spec.features.contains(&ComputeFeature::UnknownFeature)); + assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]); + } } diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index e2afa17ef0..ccd015ad19 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -243,5 +243,9 @@ "public_extensions": [ "postgis" ] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" } } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index ed375a152f..d09ba11344 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,8 +3,11 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] + use once_cell::sync::Lazy; -use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; +use prometheus::core::{ + Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, +}; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; @@ -132,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage { rusage.assume_init() } } + +/// Create an [`IntCounterPairVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_int_counter_pair_vec { + ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) 
=> {{ + match ( + $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES), + $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES), + ) { + (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)), + (Err(e), _) | (_, Err(e)) => Err(e), + } + }}; +} +/// Create an [`IntCounterPair`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_int_counter_pair { + ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{ + match ( + $crate::register_int_counter!($NAME1, $HELP1), + $crate::register_int_counter!($NAME2, $HELP2), + ) { + (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)), + (Err(e), _) | (_, Err(e)) => Err(e), + } + }}; +} + +/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes +pub struct GenericCounterPairVec { + inc: GenericCounterVec

, + dec: GenericCounterVec

, +} + +/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes +pub struct GenericCounterPair { + inc: GenericCounter

, + dec: GenericCounter

, +} + +impl GenericCounterPairVec

{ + pub fn new(inc: GenericCounterVec

, dec: GenericCounterVec

) -> Self { + Self { inc, dec } + } + + /// `get_metric_with_label_values` returns the [`GenericCounterPair

`] for the given slice + /// of label values (same order as the VariableLabels in Desc). If that combination of + /// label values is accessed for the first time, a new [`GenericCounterPair

`] is created. + /// + /// An error is returned if the number of label values is not the same as the + /// number of VariableLabels in Desc. + pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + Ok(GenericCounterPair { + inc: self.inc.get_metric_with_label_values(vals)?, + dec: self.dec.get_metric_with_label_values(vals)?, + }) + } + + /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error + /// occurs. + pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair

{ + self.get_metric_with_label_values(vals).unwrap() + } +} + +impl GenericCounterPair

{ + pub fn new(inc: GenericCounter

, dec: GenericCounter

) -> Self { + Self { inc, dec } + } + + /// Increment the gauge by 1, returning a guard that decrements by 1 on drop. + pub fn guard(&self) -> GenericCounterPairGuard

{ + self.inc.inc(); + GenericCounterPairGuard(self.dec.clone()) + } + + /// Increment the gauge by n, returning a guard that decrements by n on drop. + pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy

{ + self.inc.inc_by(n); + GenericCounterPairGuardBy(self.dec.clone(), n) + } + + /// Increase the gauge by 1. + #[inline] + pub fn inc(&self) { + self.inc.inc(); + } + + /// Decrease the gauge by 1. + #[inline] + pub fn dec(&self) { + self.dec.inc(); + } + + /// Add the given value to the gauge. (The value can be + /// negative, resulting in a decrement of the gauge.) + #[inline] + pub fn inc_by(&self, v: P::T) { + self.inc.inc_by(v); + } + + /// Subtract the given value from the gauge. (The value can be + /// negative, resulting in an increment of the gauge.) + #[inline] + pub fn dec_by(&self, v: P::T) { + self.dec.inc_by(v); + } +} + +/// Guard returned by [`GenericCounterPair::guard`] +pub struct GenericCounterPairGuard(GenericCounter

); + +impl Drop for GenericCounterPairGuard

{ + fn drop(&mut self) { + self.0.inc(); + } +} +/// Guard returned by [`GenericCounterPair::guard_by`] +pub struct GenericCounterPairGuardBy(GenericCounter

, P::T); + +impl Drop for GenericCounterPairGuardBy

{ + fn drop(&mut self) { + self.0.inc_by(self.1); + } +} + +/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes +pub type IntCounterPairVec = GenericCounterPairVec; + +/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes +pub type IntCounterPair = GenericCounterPair; + +/// A guard for [`IntCounterPair`] that will decrement the gauge on drop +pub type IntCounterPairGuard = GenericCounterPairGuard; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 4d08d78e87..4146597d8d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -24,3 +24,4 @@ workspace_hack.workspace = true [dev-dependencies] bincode.workspace = true +rand.workspace = true diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b5350d6384..d680a5600e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -140,3 +140,41 @@ impl Key { }) } } + +pub fn is_rel_block_key(key: &Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 +} + +impl std::str::FromStr for Key { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::result::Result { + Self::from_hex(s) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use crate::key::Key; + + use rand::Rng; + use rand::SeedableRng; + + #[test] + fn display_fromstr_bijection() { + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + + let key = Key { + field1: rng.gen(), + field2: rng.gen(), + field3: rng.gen(), + field4: rng.gen(), + field5: rng.gen(), + field6: rng.gen(), + }; + + assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); + } +} diff --git a/pageserver/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs similarity index 91% rename from pageserver/src/keyspace.rs rename to libs/pageserver_api/src/keyspace.rs index 20e6df9c7b..80183506d8 100644 --- a/pageserver/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,12 @@ -use 
crate::repository::{key_range_size, singleton_range, Key}; use postgres_ffi::BLCKSZ; use std::ops::Range; +use crate::key::Key; + /// /// Represents a set of Keys, in a compact form. /// -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct KeySpace { /// Contiguous ranges of keys that belong to the key space. In key order, /// and with no overlap. @@ -123,6 +124,9 @@ impl KeySpaceAccum { if range.start == accum.end { accum.end = range.end; } else { + // TODO: to efficiently support small sharding stripe sizes, we should avoid starting + // a new range here if the skipped region was all keys that don't belong on this shard. + // (https://github.com/neondatabase/neon/issues/6247) assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; @@ -186,6 +190,33 @@ impl KeySpaceRandomAccum { } } +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } +} + +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 511c5ed208..b236b93428 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,6 +5,7 @@ use const_format::formatcp; /// Public API types pub mod control_api; pub mod key; +pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 60c508037e..dea925b468 100644 --- a/libs/pageserver_api/src/models.rs +++ 
b/libs/pageserver_api/src/models.rs @@ -1,5 +1,8 @@ +pub mod partitioning; + use std::{ collections::HashMap, + io::Read, num::{NonZeroU64, NonZeroUsize}, time::SystemTime, }; @@ -17,7 +20,7 @@ use utils::{ use crate::{reltag::RelTag, shard::TenantShardId}; use anyhow::bail; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. /// @@ -179,6 +182,8 @@ pub struct TimelineCreateRequest { #[serde(default)] pub ancestor_timeline_id: Option, #[serde(default)] + pub existing_initdb_timeline_id: Option, + #[serde(default)] pub ancestor_start_lsn: Option, pub pg_version: Option, } @@ -235,6 +240,7 @@ pub struct TenantConfig { pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, pub gc_feedback: Option, + pub heatmap_period: Option, } /// A flattened analog of a `pagesever::tenant::LocationMode`, which @@ -321,6 +327,7 @@ impl TenantConfigRequest { #[derive(Debug, Deserialize)] pub struct TenantAttachRequest { + #[serde(default)] pub config: TenantAttachConfig, #[serde(default)] pub generation: Option, @@ -328,7 +335,7 @@ pub struct TenantAttachRequest { /// Newtype to enforce deny_unknown_fields on TenantConfig for /// its usage inside `TenantAttachRequest`. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Default)] #[serde(deny_unknown_fields)] pub struct TenantAttachConfig { #[serde(flatten)] @@ -354,7 +361,7 @@ pub enum TenantAttachmentStatus { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { - pub id: TenantId, + pub id: TenantShardId, // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. 
@@ -363,10 +370,18 @@ pub struct TenantInfo { pub attachment_status: TenantAttachmentStatus, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantDetails { + #[serde(flatten)] + pub tenant_info: TenantInfo, + + pub timelines: Vec, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { - pub tenant_id: TenantId, + pub tenant_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, @@ -382,7 +397,12 @@ pub struct TimelineInfo { /// The LSN that we are advertizing to safekeepers pub remote_consistent_lsn_visible: Lsn, - pub current_logical_size: Option, // is None when timeline is Unloaded + /// The LSN from the start of the root timeline (never changes) + pub initdb_lsn: Lsn, + + pub current_logical_size: u64, + pub current_logical_size_is_accurate: bool, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded @@ -537,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState { ShutDown, } -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
- pub actions: String, -} - #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, @@ -565,6 +572,7 @@ pub enum PagestreamFeMessage { } // Wrapped in libpq CopyData +#[derive(strum_macros::EnumProperty)] pub enum PagestreamBeMessage { Exists(PagestreamExistsResponse), Nblocks(PagestreamNblocksResponse), @@ -573,6 +581,29 @@ pub enum PagestreamBeMessage { DbSize(PagestreamDbSizeResponse), } +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamBeMessageTag { + Exists = 100, + Nblocks = 101, + GetPage = 102, + Error = 103, + DbSize = 104, +} +impl TryFrom for PagestreamBeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 100 => Ok(PagestreamBeMessageTag::Exists), + 101 => Ok(PagestreamBeMessageTag::Nblocks), + 102 => Ok(PagestreamBeMessageTag::GetPage), + 103 => Ok(PagestreamBeMessageTag::Error), + 104 => Ok(PagestreamBeMessageTag::DbSize), + _ => Err(value), + } + } +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { pub latest: bool, @@ -728,35 +759,91 @@ impl PagestreamBeMessage { pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); + use PagestreamBeMessageTag as Tag; match self { Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Exists as u8); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Nblocks as u8); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::GetPage as u8); bytes.put(&resp.page[..]); } Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Error as u8); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::DbSize as u8); 
bytes.put_i64(resp.db_size); } } bytes.into() } + + pub fn deserialize(buf: Bytes) -> anyhow::Result { + let mut buf = buf.reader(); + let msg_tag = buf.read_u8()?; + + use PagestreamBeMessageTag as Tag; + let ok = + match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { + Tag::Exists => { + let exists = buf.read_u8()?; + Self::Exists(PagestreamExistsResponse { + exists: exists != 0, + }) + } + Tag::Nblocks => { + let n_blocks = buf.read_u32::()?; + Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + } + Tag::GetPage => { + let mut page = vec![0; 8192]; // TODO: use MaybeUninit + buf.read_exact(&mut page)?; + PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + } + Tag::Error => { + let buf = buf.get_ref(); + let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?; + let rust_str = cstr.to_str()?; + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: rust_str.to_owned(), + }) + } + Tag::DbSize => { + let db_size = buf.read_i64::()?; + Self::DbSize(PagestreamDbSizeResponse { db_size }) + } + }; + let remaining = buf.into_inner(); + if !remaining.is_empty() { + anyhow::bail!( + "remaining bytes in msg with tag={msg_tag}: {}", + remaining.len() + ); + } + Ok(ok) + } + + pub fn kind(&self) -> &'static str { + match self { + Self::Exists(_) => "Exists", + Self::Nblocks(_) => "Nblocks", + Self::GetPage(_) => "GetPage", + Self::Error(_) => "Error", + Self::DbSize(_) => "DbSize", + } + } } #[cfg(test)] @@ -818,7 +905,7 @@ mod tests { fn test_tenantinfo_serde() { // Test serialization/deserialization of TenantInfo let original_active = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, @@ -835,7 +922,7 @@ mod tests { }); let original_broken = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: 
TenantState::Broken { reason: "reason".into(), backtrace: "backtrace info".into(), diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs new file mode 100644 index 0000000000..0d287f7be0 --- /dev/null +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -0,0 +1,151 @@ +use utils::lsn::Lsn; + +#[derive(Debug, PartialEq, Eq)] +pub struct Partitioning { + pub keys: crate::keyspace::KeySpace, + + pub at_lsn: Lsn, +} + +impl serde::Serialize for Partitioning { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); + + impl<'a> serde::Serialize for KeySpace<'a> { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeSeq; + let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; + for kr in &self.0.ranges { + seq.serialize_element(&KeyRange(kr))?; + } + seq.end() + } + } + + use serde::ser::SerializeMap; + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_key("keys")?; + map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("at_lsn")?; + map.serialize_value(&WithDisplay(&self.at_lsn))?; + map.end() + } +} + +pub struct WithDisplay<'a, T>(&'a T); + +impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.0) + } +} + +pub struct KeyRange<'a>(&'a std::ops::Range); + +impl<'a> serde::Serialize for KeyRange<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeTuple; + let mut t = serializer.serialize_tuple(2)?; + t.serialize_element(&WithDisplay(&self.0.start))?; + t.serialize_element(&WithDisplay(&self.0.end))?; + t.end() + } +} + +impl<'a> serde::Deserialize<'a> for Partitioning { + fn deserialize(deserializer: 
D) -> Result + where + D: serde::Deserializer<'a>, + { + pub struct KeySpace(crate::keyspace::KeySpace); + + impl<'de> serde::Deserialize<'de> for KeySpace { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + #[serde(transparent)] + struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key); + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct Range(Key, Key); + + let ranges: Vec = serde::Deserialize::deserialize(deserializer)?; + Ok(Self(crate::keyspace::KeySpace { + ranges: ranges + .into_iter() + .map(|Range(start, end)| (start.0..end.0)) + .collect(), + })) + } + } + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct De { + keys: KeySpace, + #[serde_as(as = "serde_with::DisplayFromStr")] + at_lsn: Lsn, + } + + let de: De = serde::Deserialize::deserialize(deserializer)?; + Ok(Self { + at_lsn: de.at_lsn, + keys: de.keys.0, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_serialization_roundtrip() { + let reference = r#" + { + "keys": [ + [ + "000000000000000000000000000000000000", + "000000000000000000000000000000000001" + ], + [ + "000000067F00000001000000000000000000", + "000000067F00000001000000000000000002" + ], + [ + "030000000000000000000000000000000000", + "030000000000000000000000000000000003" + ] + ], + "at_lsn": "0/2240160" + } + "#; + + let de: Partitioning = serde_json::from_str(reference).unwrap(); + + let ser = serde_json::to_string(&de).unwrap(); + + let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap(); + + assert_eq!( + ser_de, + serde_json::from_str::<'_, serde_json::Value>(reference).unwrap() + ); + } +} diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3510b4dbca..a186d93bce 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,5 +1,6 @@ use std::{ops::RangeInclusive, str::FromStr}; 
+use crate::key::{is_rel_block_key, Key}; use hex::FromHex; use serde::{Deserialize, Serialize}; use thiserror; @@ -72,19 +73,37 @@ impl TenantShardId { ) } - pub fn shard_slug(&self) -> String { - format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } +} + +/// Formatting helper +struct ShardSlug<'a>(&'a TenantShardId); + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) } } impl std::fmt::Display for TenantShardId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.shard_count != ShardCount(0) { - write!( - f, - "{}-{:02x}{:02x}", - self.tenant_id, self.shard_number.0, self.shard_count.0 - ) + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) } else { // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this // is distinct from the normal single shard case (shard count == 1). @@ -144,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId { /// shard we're dealing with, but do not need to know the full ShardIdentity (because /// we won't be doing any page->shard mapping), and do not need to know the fully qualified /// TenantShardId. 
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, pub shard_count: ShardCount, @@ -302,6 +321,8 @@ pub struct ShardStripeSize(pub u32); pub struct ShardLayout(u8); const LAYOUT_V1: ShardLayout = ShardLayout(1); +/// ShardIdentity uses a magic layout value to indicate if it is unusable +const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); @@ -310,10 +331,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); /// to resolve a key to a shard, and then check whether that shard is ==self. #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardIdentity { - pub layout: ShardLayout, pub number: ShardNumber, pub count: ShardCount, - pub stripe_size: ShardStripeSize, + stripe_size: ShardStripeSize, + layout: ShardLayout, } #[derive(thiserror::Error, Debug, PartialEq, Eq)] @@ -339,6 +360,22 @@ impl ShardIdentity { } } + /// A broken instance of this type is only used for `TenantState::Broken` tenants, + /// which are constructed in code paths that don't have access to proper configuration. + /// + /// A ShardIdentity in this state may not be used for anything, and should not be persisted. + /// Enforcement is via assertions, to avoid making our interface fallible for this + /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken + /// state, and by extension to avoid trying to do any page->shard resolution. 
+ pub fn broken(number: ShardNumber, count: ShardCount) -> Self { + Self { + number, + count, + layout: LAYOUT_BROKEN, + stripe_size: DEFAULT_STRIPE_SIZE, + } + } + pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -365,6 +402,54 @@ impl ShardIdentity { }) } } + + fn is_broken(&self) -> bool { + self.layout == LAYOUT_BROKEN + } + + pub fn get_shard_number(&self, key: &Key) -> ShardNumber { + assert!(!self.is_broken()); + key_to_shard_number(self.count, self.stripe_size, key) + } + + /// Return true if the key should be ingested by this shard + pub fn is_key_local(&self, key: &Key) -> bool { + assert!(!self.is_broken()); + if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { + true + } else { + key_to_shard_number(self.count, self.stripe_size, key) == self.number + } + } + + /// Return true if the key should be discarded if found in this shard's + /// data store, e.g. during compaction after a split + pub fn is_key_disposable(&self, key: &Key) -> bool { + if key_is_shard0(key) { + // Q: Why can't we dispose of shard0 content if we're not shard 0? + // A: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + false + } else { + !self.is_key_local(key) + } + } + + pub fn shard_slug(&self) -> String { + if self.count > ShardCount(0) { + format!("-{:02x}{:02x}", self.number.0, self.count.0) + } else { + String::new() + } + } + + /// Convenience for checking if this identity is the 0th shard in a tenant, + /// for special cases on shard 0 such as ingesting relation sizes. + pub fn is_zero(&self) -> bool { + self.number == ShardNumber(0) + } } impl Serialize for ShardIndex { @@ -438,6 +523,65 @@ impl<'de> Deserialize<'de> for ShardIndex { } } +/// Whether this key is always held on shard 0 (e.g. 
shard 0 holds all SLRU keys +/// in order to be able to serve basebackup requests without peer communication). +fn key_is_shard0(key: &Key) -> bool { + // To decide what to shard out to shards >0, we apply a simple rule that only + // relation pages are distributed to shards other than shard zero. Everything else gets + // stored on shard 0. This guarantees that shard 0 can independently serve basebackup + // requests, and any request other than those for particular blocks in relations. + // + // In this condition: + // - is_rel_block_key includes only relations, i.e. excludes SLRU data and + // all metadata. + // - field6 is set to -1 for relation size pages. + !(is_rel_block_key(key) && key.field6 != 0xffffffff) +} + +/// Provide the same result as the function in postgres `hashfn.h` with the same name +fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +/// Provide the same result as the function in postgres `hashfn.h` with the same name +fn hash_combine(mut a: u32, mut b: u32) -> u32 { + b = b.wrapping_add(0x9e3779b9); + b = b.wrapping_add(a << 6); + b = b.wrapping_add(a >> 2); + + a ^= b; + a +} + +/// Where a Key is to be distributed across shards, select the shard. This function +/// does not account for keys that should be broadcast across shards. +/// +/// The hashing in this function must exactly match what we do in postgres smgr +/// code. The resulting distribution of pages is intended to preserve locality within +/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise +/// distributing data pseudo-randomly. +/// +/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional +/// and will be handled at higher levels when shards are split. 
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber { + // Fast path for un-sharded tenants or broadcast keys + if count < ShardCount(2) || key_is_shard0(key) { + return ShardNumber(0); + } + + // relNode + let mut hash = murmurhash32(key.field4); + // blockNum/stripe size + hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0)); + + ShardNumber((hash % count.0 as u32) as u8) +} + #[cfg(test)] mod tests { use std::str::FromStr; @@ -609,4 +753,29 @@ mod tests { Ok(()) } + + // These are only smoke tests to spot check that our implementation doesn't + // deviate from a few examples values: not aiming to validate the overall + // hashing algorithm. + #[test] + fn murmur_hash() { + assert_eq!(murmurhash32(0), 0); + + assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9); + } + + #[test] + fn shard_mapping() { + let key = Key { + field1: 0x00, + field2: 0x67f, + field3: 0x5, + field4: 0x400c, + field5: 0x00, + field6: 0x7d06, + }; + + let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); + assert_eq!(shard, ShardNumber(8)); + } } diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 35cb1a2691..ccf9108895 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -163,8 +163,18 @@ impl PgConnectionConfig { } /// Connect using postgres protocol with TLS disabled. 
- pub fn connect_no_tls(&self) -> Result { - postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls) + pub async fn connect_no_tls( + &self, + ) -> Result< + ( + tokio_postgres::Client, + tokio_postgres::Connection, + ), + postgres::Error, + > { + self.to_tokio_postgres_config() + .connect(postgres::NoTls) + .await } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 41fc206cd7..c52a21bcd3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -289,10 +289,10 @@ impl FeStartupPacket { // We shouldn't advance `buf` as probably full message is not there yet, // so can't directly use Bytes::get_u32 etc. let len = (&buf[0..4]).read_u32::().unwrap() as usize; - // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` + // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` // which is less readable #[allow(clippy::manual_range_contains)] - if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { + if len < 8 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( "invalid startup packet message length {}", len @@ -975,4 +975,10 @@ mod tests { let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]); } + + #[test] + fn parse_fe_startup_packet_regression() { + let data = [0, 0, 0, 7, 0, 0, 0, 0]; + FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err(); + } } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index e8bfc005d3..2cc59a947b 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,10 +16,11 @@ aws-credential-types.workspace = true bytes.workspace = true camino.workspace = true hyper = { workspace = true, features = ["stream"] } +futures.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } -tokio-util.workspace = true 
+tokio-util = { workspace = true, features = ["compat"] } toml_edit.workspace = true tracing.workspace = true scopeguard.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ae08e9b171..7ea1103eb2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -1,21 +1,24 @@ //! Azure Blob Storage wrapper +use std::borrow::Cow; use std::collections::HashMap; use std::env; use std::num::NonZeroU32; +use std::pin::Pin; use std::sync::Arc; -use std::{borrow::Cow, io::Cursor}; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; +use azure_core::RetryOptions; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use bytes::Bytes; +use futures::stream::Stream; use futures_util::StreamExt; use http_types::StatusCode; -use tokio::io::AsyncRead; use tracing::debug; use crate::s3_bucket::RequestKind; @@ -49,7 +52,8 @@ impl AzureBlobStorage { StorageCredentials::token_credential(Arc::new(token_credential)) }; - let builder = ClientBuilder::new(account, credentials); + // we have an outer retry + let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none()); let client = builder.container_client(azure_config.container_name.to_owned()); @@ -113,12 +117,22 @@ impl AzureBlobStorage { ) -> Result { let mut response = builder.into_stream(); + let mut etag = None; + let mut last_modified = None; let mut metadata = HashMap::new(); // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 - let mut buf = Vec::new(); + + let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part.map_err(to_download_error)?; + let etag_str: &str = 
part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } @@ -127,10 +141,12 @@ impl AzureBlobStorage { .collect() .await .map_err(|e| DownloadError::Other(e.into()))?; - buf.extend_from_slice(&data.slice(..)); + bufs.push(data); } Ok(Download { - download_stream: Box::pin(Cursor::new(buf)), + download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, metadata: Some(StorageMetadata(metadata)), }) } @@ -217,9 +233,10 @@ impl RemoteStorage for AzureBlobStorage { } Ok(res) } + async fn upload( &self, - mut from: impl AsyncRead + Unpin + Send + Sync + 'static, + from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, @@ -227,13 +244,12 @@ impl RemoteStorage for AzureBlobStorage { let _permit = self.permit(RequestKind::Put).await; let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - // TODO FIX THIS UGLY HACK and don't buffer the entire object - // into RAM here, but use the streaming interface. For that, - // we'd have to change the interface though... 
- // https://github.com/neondatabase/neon/issues/5563 - let mut buf = Vec::with_capacity(data_size_bytes); - tokio::io::copy(&mut from, &mut buf).await?; - let body = azure_core::Body::Bytes(buf.into()); + let from: Pin> + Send + Sync + 'static>> = + Box::pin(from); + + let from = NonSeekableStream::new(from, data_size_bytes); + + let body = azure_core::Body::SeekableStream(Box::new(from)); let mut builder = blob_client.put_block_blob(body); @@ -266,17 +282,12 @@ impl RemoteStorage for AzureBlobStorage { let mut builder = blob_client.get(); - if let Some(end_exclusive) = end_exclusive { - builder = builder.range(Range::new(start_inclusive, end_exclusive)); + let range: Range = if let Some(end_exclusive) = end_exclusive { + (start_inclusive..end_exclusive).into() } else { - // Open ranges are not supported by the SDK so we work around - // by setting the upper limit extremely high (but high enough - // to still be representable by signed 64 bit integers). - // TODO remove workaround once the SDK adds open range support - // https://github.com/Azure/azure-sdk-for-rust/issues/1438 - let end_exclusive = u64::MAX / 4; - builder = builder.range(Range::new(start_inclusive, end_exclusive)); - } + (start_inclusive..).into() + }; + builder = builder.range(range); self.download_for_builder(builder).await } @@ -312,3 +323,153 @@ impl RemoteStorage for AzureBlobStorage { Ok(()) } } + +pin_project_lite::pin_project! { + /// Hack to work around not being able to stream once with azure sdk. + /// + /// Azure sdk clones streams around with the assumption that they are like + /// `Arc` (except not supporting tokio), however our streams are not like + /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`] + /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially" + /// seekable, but we can also just re-try the request easier. 
+ #[project = NonSeekableStreamProj] + enum NonSeekableStream { + /// A stream wrappers initial form. + /// + /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1 + /// clone before first request, then this must be changed. + Initial { + inner: std::sync::Mutex>>>, + len: usize, + }, + /// The actually readable variant, produced by cloning the Initial variant. + /// + /// The sdk currently always clones once, even without retry policy. + Actual { + #[pin] + inner: tokio_util::compat::Compat>, + len: usize, + read_any: bool, + }, + /// Most likely unneeded, but left to make life easier, in case more clones are added. + Cloned { + len_was: usize, + } + } +} + +impl NonSeekableStream +where + S: Stream> + Send + Sync + 'static, +{ + fn new(inner: S, len: usize) -> NonSeekableStream { + use tokio_util::compat::TokioAsyncReadCompatExt; + + let inner = tokio_util::io::StreamReader::new(inner).compat(); + let inner = Some(inner); + let inner = std::sync::Mutex::new(inner); + NonSeekableStream::Initial { inner, len } + } +} + +impl std::fmt::Debug for NonSeekableStream { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(), + Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(), + Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(), + } + } +} + +impl futures::io::AsyncRead for NonSeekableStream +where + S: Stream>, +{ + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut [u8], + ) -> std::task::Poll> { + match self.project() { + NonSeekableStreamProj::Actual { + inner, read_any, .. 
+ } => { + *read_any = true; + inner.poll_read(cx, buf) + } + // NonSeekableStream::Initial does not support reading because it is just much easier + // to have the mutex in place where one does not poll the contents, or that's how it + // seemed originally. If there is a version upgrade which changes the cloning, then + // that support needs to be hacked in. + // + // including {self:?} into the message would be useful, but unsure how to unproject. + _ => std::task::Poll::Ready(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "cloned or initial values cannot be read", + ))), + } + } +} + +impl Clone for NonSeekableStream { + /// Weird clone implementation exists to support the sdk doing cloning before issuing the first + /// request, see type documentation. + fn clone(&self) -> Self { + use NonSeekableStream::*; + + match self { + Initial { inner, len } => { + if let Some(inner) = inner.lock().unwrap().take() { + Actual { + inner, + len: *len, + read_any: false, + } + } else { + Self::Cloned { len_was: *len } + } + } + Actual { len, .. } => Cloned { len_was: *len }, + Cloned { len_was } => Cloned { len_was: *len_was }, + } + } +} + +#[async_trait::async_trait] +impl azure_core::SeekableStream for NonSeekableStream +where + S: Stream> + Unpin + Send + Sync + 'static, +{ + async fn reset(&mut self) -> azure_core::error::Result<()> { + use NonSeekableStream::*; + + let msg = match self { + Initial { inner, .. } => { + if inner.get_mut().unwrap().is_some() { + return Ok(()); + } else { + "reset after first clone is not supported" + } + } + Actual { read_any, .. } if !*read_any => return Ok(()), + Actual { .. } => "reset after reading is not supported", + Cloned { .. 
} => "reset after second clone is not supported", + }; + Err(azure_core::error::Error::new( + azure_core::error::ErrorKind::Io, + std::io::Error::new(std::io::ErrorKind::Other, msg), + )) + } + + // Note: it is not documented if this should be the total or remaining length, total passes the + // tests. + fn len(&self) -> usize { + use NonSeekableStream::*; + match self { + Initial { len, .. } => *len, + Actual { len, .. } => *len, + Cloned { len_was, .. } => *len_was, + } + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e6d306ff66..3e408e3119 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,13 +14,17 @@ mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; +use std::{ + collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, +}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; +use bytes::Bytes; +use futures::stream::Stream; use serde::{Deserialize, Serialize}; -use tokio::{io, sync::Semaphore}; +use tokio::sync::Semaphore; use toml_edit::Item; use tracing::info; @@ -179,7 +183,7 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl Stream> + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, @@ -205,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } +pub type DownloadStream = Pin> + Unpin + Send + Sync>>; pub struct Download { - pub download_stream: Pin>, + pub download_stream: DownloadStream, + /// The last time the file was modified (`last-modified` HTTP header) + pub last_modified: Option, + /// A way to identify this specific version of the resource (`etag` HTTP header) + pub etag: Option, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } @@ -300,7 +309,7 @@ impl GenericRemoteStorage { pub async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, @@ -398,7 +407,7 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. pub async fn upload_storage_object( &self, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, ) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1be50ce565..d1e7d325b9 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -7,15 +7,18 @@ use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin}; use anyhow::{bail, ensure, Context}; +use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; +use futures::stream::Stream; use tokio::{ fs, io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; +use tokio_util::io::ReaderStream; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; +use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; use 
super::{RemoteStorage, StorageMetadata}; @@ -99,27 +102,35 @@ impl LocalFs { }; // If we were given a directory, we may use it as our starting point. - // Otherwise, we must go up to the parent directory. This is because + // Otherwise, we must go up to the first ancestor dir that exists. This is because // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); - match fs::metadata(full_path.clone()).await { - Ok(meta) => { - if !meta.is_dir() { + loop { + // Did we make it to the root? + if initial_dir.parent().is_none() { + anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}"); + } + + match fs::metadata(initial_dir.clone()).await { + Ok(meta) if meta.is_dir() => { + // We found a directory, break + break; + } + Ok(_meta) => { // It's not a directory: strip back to the parent initial_dir.pop(); } - } - Err(e) if e.kind() == ErrorKind::NotFound => { - // It's not a file that exists: strip the prefix back to the parent directory - initial_dir.pop(); - } - Err(e) => { - // Unexpected I/O error - anyhow::bail!(e) + Err(e) if e.kind() == ErrorKind::NotFound => { + // It's not a file that exists: strip the prefix back to the parent directory + initial_dir.pop(); + } + Err(e) => { + // Unexpected I/O error + anyhow::bail!(e) + } } } - // Note that Utf8PathBuf starts_with only considers full path segments, but // object prefixes are arbitrary strings, so we need the strings for doing // starts_with later. 
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs { async fn upload( &self, - data: impl io::AsyncRead + Unpin + Send + Sync + 'static, + data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, @@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs { ); let from_size_bytes = data_size_bytes as u64; + let data = tokio_util::io::StreamReader::new(data); + let data = std::pin::pin!(data); let mut buffer_to_read = data.take(from_size_bytes); - let bytes_read = io::copy(&mut buffer_to_read, &mut destination) + // alternatively we could just write the bytes to a file, but local_fs is a testing utility + let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination) .await .with_context(|| { format!( @@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs { async fn download(&self, from: &RemotePath) -> Result { let target_path = from.with_base(&self.storage_root); if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = io::BufReader::new( + let source = ReaderStream::new( fs::OpenOptions::new() .read(true) .open(&target_path) @@ -317,6 +331,8 @@ impl RemoteStorage for LocalFs { .map_err(DownloadError::Other)?; Ok(Download { metadata, + last_modified: None, + etag: None, download_stream: Box::pin(source), }) } else { @@ -340,16 +356,14 @@ impl RemoteStorage for LocalFs { } let target_path = from.with_base(&self.storage_root); if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ - let mut source = io::BufReader::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?; source .seek(io::SeekFrom::Start(start_inclusive)) .await @@ -360,15 +374,17 @@ impl RemoteStorage for LocalFs { .await .map_err(DownloadError::Other)?; - Ok(match end_exclusive { - Some(end_exclusive) => Download { - metadata, - download_stream: Box::pin(source.take(end_exclusive - start_inclusive)), - }, - None => Download { - metadata, - download_stream: Box::pin(source), - }, + let download_stream: DownloadStream = match end_exclusive { + Some(end_exclusive) => Box::pin(ReaderStream::new( + source.take(end_exclusive - start_inclusive), + )), + None => Box::pin(ReaderStream::new(source)), + }; + Ok(Download { + metadata, + last_modified: None, + etag: None, + download_stream, }) } else { Err(DownloadError::NotFound) @@ -467,7 +483,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result { mod fs_tests { use super::*; + use bytes::Bytes; use camino_tempfile::tempdir; + use futures_util::Stream; use std::{collections::HashMap, io::Write}; async fn read_and_assert_remote_file_contents( @@ -477,7 +495,7 @@ mod fs_tests { remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { - let mut download = storage + let download = storage .download(remote_storage_path) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; @@ -486,13 +504,9 @@ mod fs_tests { "Unexpected metadata returned for the downloaded file" ); - let mut contents = String::new(); - download - .download_stream - .read_to_string(&mut contents) - .await - .context("Failed 
to read remote file contents into string")?; - Ok(contents) + let contents = aggregate(download.download_stream).await?; + + String::from_utf8(contents).map_err(anyhow::Error::new) } #[tokio::test] @@ -521,25 +535,26 @@ mod fs_tests { let storage = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; - let content = std::io::Cursor::new(b"12345"); + let content = Bytes::from_static(b"12345"); + let content = move || futures::stream::once(futures::future::ready(Ok(content.clone()))); // Check that you get an error if the size parameter doesn't match the actual // size of the stream. storage - .upload(Box::new(content.clone()), 0, &id, None) + .upload(content(), 0, &id, None) .await .expect_err("upload with zero size succeeded"); storage - .upload(Box::new(content.clone()), 4, &id, None) + .upload(content(), 4, &id, None) .await .expect_err("upload with too short size succeeded"); storage - .upload(Box::new(content.clone()), 6, &id, None) + .upload(content(), 6, &id, None) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. 
- storage.upload(Box::new(content), 5, &id, None).await?; + storage.upload(content(), 5, &id, None).await?; Ok(()) } @@ -587,7 +602,7 @@ mod fs_tests { let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); - let mut first_part_download = storage + let first_part_download = storage .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; assert!( @@ -595,21 +610,13 @@ mod fs_tests { "No metadata should be returned for no metadata upload" ); - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - io::copy( - &mut first_part_download.download_stream, - &mut first_part_remote, - ) - .await?; - first_part_remote.flush().await?; - let first_part_remote = first_part_remote.into_inner().into_inner(); + let first_part_remote = aggregate(first_part_download.download_stream).await?; assert_eq!( - first_part_local, - first_part_remote.as_slice(), + first_part_local, first_part_remote, "First part bytes should be returned when requested" ); - let mut second_part_download = storage + let second_part_download = storage .download_byte_range( &upload_target, first_part_local.len() as u64, @@ -621,17 +628,9 @@ mod fs_tests { "No metadata should be returned for no metadata upload" ); - let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - io::copy( - &mut second_part_download.download_stream, - &mut second_part_remote, - ) - .await?; - second_part_remote.flush().await?; - let second_part_remote = second_part_remote.into_inner().into_inner(); + let second_part_remote = aggregate(second_part_download.download_stream).await?; assert_eq!( - second_part_local, - second_part_remote.as_slice(), + second_part_local, second_part_remote, "Second part bytes should be returned when requested" ); @@ -721,17 +720,10 @@ mod fs_tests { let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, _) = 
uploaded_bytes.split_at(3); - let mut partial_download_with_metadata = storage + let partial_download_with_metadata = storage .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - io::copy( - &mut partial_download_with_metadata.download_stream, - &mut first_part_remote, - ) - .await?; - first_part_remote.flush().await?; - let first_part_remote = first_part_remote.into_inner().into_inner(); + let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( first_part_local, first_part_remote.as_slice(), @@ -807,16 +799,16 @@ mod fs_tests { ) })?; - storage - .upload(Box::new(file), size, &relative_path, metadata) - .await?; + let file = tokio_util::io::ReaderStream::new(file); + + storage.upload(file, size, &relative_path, metadata).await?; Ok(relative_path) } async fn create_file_for_upload( path: &Utf8Path, contents: &str, - ) -> anyhow::Result<(io::BufReader, usize)> { + ) -> anyhow::Result<(fs::File, usize)> { std::fs::create_dir_all(path.parent().unwrap())?; let mut file_for_writing = std::fs::OpenOptions::new() .write(true) @@ -826,7 +818,7 @@ mod fs_tests { drop(file_for_writing); let file_size = path.metadata()?.len() as usize; Ok(( - io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?), + fs::OpenOptions::new().read(true).open(&path).await?, file_size, )) } @@ -840,4 +832,16 @@ mod fs_tests { files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) } + + async fn aggregate( + stream: impl Stream>, + ) -> anyhow::Result> { + use futures::stream::StreamExt; + let mut out = Vec::new(); + let mut stream = std::pin::pin!(stream); + while let Some(res) = stream.next().await { + out.extend_from_slice(&res?[..]); + } + Ok(out) + } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0cb73f73b7..0f95458ad1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ 
b/libs/remote_storage/src/s3_bucket.rs @@ -4,13 +4,19 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::{borrow::Cow, sync::Arc}; +use std::{ + borrow::Cow, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; -use anyhow::Context; +use anyhow::Context as _; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, + profile::ProfileFileCredentialsProvider, provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, @@ -28,11 +34,10 @@ use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_types::body::SdkBody; use aws_smithy_types::byte_stream::ByteStream; +use bytes::Bytes; +use futures::stream::Stream; use hyper::Body; use scopeguard::ScopeGuard; -use tokio::io::{self, AsyncRead}; -use tokio_util::io::ReaderStream; -use tracing::debug; use super::StorageMetadata; use crate::{ @@ -63,27 +68,36 @@ struct GetObjectRequest { impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
pub fn new(aws_config: &S3Config) -> anyhow::Result { - debug!( + tracing::debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); let region = Some(Region::new(aws_config.bucket_region.clone())); + let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket - .or_else("token", { - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + .or_else( + "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) - .build() - }) + .build(), + ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; @@ -214,58 +228,99 @@ impl S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - if get_object.is_err() { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Err, - started_at, - ); - } - match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag.clone(); + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + + let body = object_output.body; + let body = ByteStreamAsStream::from(body); + let body = PermitCarrying::new(permit, body); + let body = TimedDownload::new(started_at, body); + Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new(TimedDownload::new( - started_at, - RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()), - ))), + etag, + last_modified, + download_stream: Box::pin(body), }) } 
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); Err(DownloadError::NotFound) } - Err(e) => Err(DownloadError::Other( - anyhow::Error::new(e).context("download s3 object"), - )), + Err(e) => { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + Err(DownloadError::Other( + anyhow::Error::new(e).context("download s3 object"), + )) + } } } } +pin_project_lite::pin_project! { + struct ByteStreamAsStream { + #[pin] + inner: aws_smithy_types::byte_stream::ByteStream + } +} + +impl From for ByteStreamAsStream { + fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self { + ByteStreamAsStream { inner } + } +} + +impl Stream for ByteStreamAsStream { + type Item = std::io::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // this does the std::io::ErrorKind::Other conversion + self.project().inner.poll_next(cx).map_err(|x| x.into()) + } + + // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes + // sense and Stream::size_hint does not really +} + pin_project_lite::pin_project! { /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. 
- struct RatelimitedAsyncRead { + struct PermitCarrying { permit: tokio::sync::OwnedSemaphorePermit, #[pin] inner: S, } } -impl RatelimitedAsyncRead { +impl PermitCarrying { fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - RatelimitedAsyncRead { permit, inner } + Self { permit, inner } } } -impl AsyncRead for RatelimitedAsyncRead { - fn poll_read( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &mut io::ReadBuf<'_>, - ) -> std::task::Poll> { - let this = self.project(); - this.inner.poll_read(cx, buf) +impl>> Stream for PermitCarrying { + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() } } @@ -285,7 +340,7 @@ pin_project_lite::pin_project! { } } -impl TimedDownload { +impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, @@ -295,25 +350,26 @@ impl TimedDownload { } } -impl AsyncRead for TimedDownload { - fn poll_read( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &mut io::ReadBuf<'_>, - ) -> std::task::Poll> { +impl>> Stream for TimedDownload { + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + use std::task::ready; + let this = self.project(); - let before = buf.filled().len(); - let read = std::task::ready!(this.inner.poll_read(cx, buf)); - let read_eof = buf.filled().len() == before; - - match read { - Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok, - Ok(()) => { /* still in progress */ } - Err(_) => *this.outcome = AttemptOutcome::Err, + let res = ready!(this.inner.poll_next(cx)); + match &res { + Some(Ok(_)) => {} + Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, + None => *this.outcome = metrics::AttemptOutcome::Ok, } - std::task::Poll::Ready(read) + Poll::Ready(res) + } + + fn size_hint(&self) -> (usize, 
Option) { + self.inner.size_hint() } } @@ -378,7 +434,7 @@ impl RemoteStorage for S3Bucket { let empty = Vec::new(); let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); for object in keys { let object_path = object.key().expect("response does not contain a key"); @@ -403,7 +459,7 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, metadata: Option, @@ -413,7 +469,7 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(ReaderStream::new(from)); + let body = Body::wrap_stream(from); let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); let res = self diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index cd13db1923..802b0db7f5 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,6 +1,8 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. +use bytes::Bytes; +use futures::stream::Stream; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::Mutex; @@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper { async fn upload( &self, - data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + data: impl Stream> + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs new file mode 100644 index 0000000000..bca117ed1a --- /dev/null +++ b/libs/remote_storage/tests/common/mod.rs @@ -0,0 +1,200 @@ +use std::collections::HashSet; +use std::ops::ControlFlow; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use bytes::Bytes; +use camino::Utf8Path; +use futures::stream::Stream; +use once_cell::sync::OnceCell; +use remote_storage::{Download, GenericRemoteStorage, RemotePath}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn upload_stream( + content: std::borrow::Cow<'static, [u8]>, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + use std::borrow::Cow; + + let content = match content { + Cow::Borrowed(x) => Bytes::from_static(x), + Cow::Owned(vec) => Bytes::from(vec), + }; + wrap_stream(content) +} + +pub(crate) fn wrap_stream( + content: bytes::Bytes, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + let len = content.len(); + let content = futures::future::ready(Ok(content)); + + (futures::stream::once(content), len) +} + +pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result> { + let mut buf = Vec::new(); + tokio::io::copy_buf( + &mut tokio_util::io::StreamReader::new(dl.download_stream), + &mut buf, + ) + .await?; + Ok(buf) +} + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
+pub(crate) async fn upload_simple_remote_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new( + Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), + ) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} + +pub(crate) async fn cleanup( + client: &Arc, + objects_to_delete: HashSet, +) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let 
Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} +pub(crate) struct Uploads { + pub(crate) prefixes: HashSet, + pub(crate) blobs: HashSet, +} + +pub(crate) async fn upload_remote_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); + let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, data_len) = + upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, data_len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + 
ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +pub(crate) fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + }); +} diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index b631079bc5..0387dc30e7 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -2,21 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; use camino::Utf8Path; -use once_cell::sync::OnceCell; use remote_storage::{ - AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; @@ -28,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
/// -/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`] +/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -95,7 +97,7 @@ async fn azure_pagination_should_work( /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. /// See `Azure_pagination_should_work` for more information. /// -/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`] +/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. 
This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let data1 = "remote blob data1".as_bytes(); - let data1_len = data1.len(); - let data2 = "remote blob data2".as_bytes(); - let data2_len = data2.len(); - let data3 = "remote blob data3".as_bytes(); - let data3_len = data3.len(); - ctx.client - .upload(std::io::Cursor::new(data1), data1_len, &path1, None) - .await?; + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None).await?; - ctx.client - .upload(std::io::Cursor::new(data2), data2_len, &path2, None) - .await?; + let (data, len) = upload_stream("remote blob data2".as_bytes().into()); + ctx.client.upload(data, len, &path2, None).await?; - ctx.client - .upload(std::io::Cursor::new(data3), data3_len, &path3, None) - .await?; + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None).await?; ctx.client.delete_objects(&[path1, path2]).await?; @@ -219,53 +212,47 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let data = "remote blob data here".as_bytes(); - let data_len = data.len() as u64; + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); - ctx.client - .upload(std::io::Cursor::new(data), data.len(), &path, None) - .await?; + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None).await?; - async fn download_and_compare(mut dl: Download) -> anyhow::Result> { - let mut buf = Vec::new(); - tokio::io::copy(&mut dl.download_stream, &mut buf).await?; - Ok(buf) - } // Normal 
download request let dl = ctx.client.download(&path).await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); // Full range (end specified) let dl = ctx .client - .download_byte_range(&path, 0, Some(data_len)) + .download_byte_range(&path, 0, Some(len as u64)) .await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); // partial range (end specified) let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data[4..10]); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) let dl = ctx .client - .download_byte_range(&path, 8, Some(data_len * 100)) + .download_byte_range(&path, 8, Some(len as u64 * 100)) .await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data[8..]); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data[4..]); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_and_compare(dl).await?; - assert_eq!(buf, data); + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client @@ -276,17 +263,6 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); -} - struct 
EnabledAzure { client: Arc, base_prefix: &'static str, @@ -356,7 +332,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -418,7 +394,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_simple_azure_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -482,143 +458,3 @@ fn create_azure_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_azure_data( - client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let data = format!("remote blob data {i}").into_bytes(); - let data_len = data.len(); - task_client - .upload(std::io::Cursor::new(data), data_len, &blob_path, None) - .await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut 
upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
-async fn upload_simple_azure_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let data = format!("remote blob data {i}").into_bytes(); - let data_len = data.len(); - task_client - .upload(std::io::Cursor::new(data), data_len, &blob_path, None) - .await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 48f00e0106..8f46b2abd6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -2,21 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; use camino::Utf8Path; -use once_cell::sync::OnceCell; use remote_storage::{ 
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; @@ -28,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. /// -/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -93,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. /// See `s3_pagination_should_work` for more information. 
/// -/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`] +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_remote_data`] /// Then performs the following queries: /// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let data1 = "remote blob data1".as_bytes(); - let data1_len = data1.len(); - let data2 = "remote blob data2".as_bytes(); - let data2_len = data2.len(); - let data3 = "remote blob data3".as_bytes(); - let data3_len = data3.len(); - ctx.client - .upload(std::io::Cursor::new(data1), data1_len, &path1, None) - .await?; + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None).await?; - ctx.client - .upload(std::io::Cursor::new(data2), data2_len, &path2, None) - .await?; + let (data, len) = upload_stream("remote blob data2".as_bytes().into()); + ctx.client.upload(data, len, &path2, None).await?; - ctx.client - .upload(std::io::Cursor::new(data3), data3_len, &path3, None) - .await?; + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None).await?; ctx.client.delete_objects(&[path1, path2]).await?; @@ -205,15 +198,65 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - });
+#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let MaybeEnabledS3::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None).await?; + + // Normal download request + let dl = ctx.client.download(&path).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(len as u64)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // partial range (end specified) + let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(len as u64 * 100)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); + + // Partial range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); + + // Full range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete(&path) + .await + .with_context(|| format!("{path:?} removal"))?; + + Ok(()) } struct EnabledS3 { @@ -285,7 +328,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await 
{ + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -347,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_simple_s3_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -410,143 +453,3 @@ fn create_s3_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_s3_data( - client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let data = format!("remote blob data {i}").into_bytes(); - let data_len = data.len(); - task_client - .upload(std::io::Cursor::new(data), data_len, &blob_path, None) - .await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - 
.and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
-async fn upload_simple_s3_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let data = format!("remote blob data {i}").into_bytes(); - let data_len = data.len(); - task_client - .upload(std::io::Cursor::new(data), data_len, &blob_path, None) - .await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ccf6f4f2d7..706b7a3187 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, including failpoints.
In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] arc-swap.workspace = true sentry.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true @@ -50,6 +57,8 @@ const_format.workspace = true # why is it only here? no other crate should use it, streams are rarely needed. tokio-stream = { version = "0.1.14" } +serde_path_to_error.workspace = true + [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index e2e84dd0ee..ca6827c9b8 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -1,16 +1,14 @@ -use std::sync::Arc; - -use tokio::sync::{mpsc, Mutex}; +use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// While a reference is kept around, the associated [`Barrier::wait`] will wait. /// /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] -pub struct Completion(mpsc::Sender<()>); +pub struct Completion(TaskTrackerToken); /// Barrier will wait until all clones of [`Completion`] have been dropped. 
#[derive(Clone)] -pub struct Barrier(Arc>>); +pub struct Barrier(TaskTracker); impl Default for Barrier { fn default() -> Self { @@ -21,7 +19,7 @@ impl Default for Barrier { impl Barrier { pub async fn wait(self) { - self.0.lock().await.recv().await; + self.0.wait().await; } pub async fn maybe_wait(barrier: Option) { @@ -33,8 +31,7 @@ impl Barrier { impl PartialEq for Barrier { fn eq(&self, other: &Self) -> bool { - // we don't use dyn so this is good - Arc::ptr_eq(&self.0, &other.0) + TaskTracker::ptr_eq(&self.0, &other.0) } } @@ -42,8 +39,10 @@ impl Eq for Barrier {} /// Create new Guard and Barrier pair. pub fn channel() -> (Completion, Barrier) { - let (tx, rx) = mpsc::channel::<()>(1); - let rx = Mutex::new(rx); - let rx = Arc::new(rx); - (Completion(tx), Barrier(rx)) + let tracker = TaskTracker::new(); + // otherwise wait never exits + tracker.close(); + + let token = tracker.token(); + (Completion(token), Barrier(tracker)) } diff --git a/pageserver/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs similarity index 61% rename from pageserver/src/failpoint_support.rs rename to libs/utils/src/failpoint_support.rs index 2190eba18a..5ec532e2a6 100644 --- a/pageserver/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,3 +1,14 @@ +//! Failpoint support code shared between pageserver and safekeepers. + +use crate::http::{ + error::ApiError, + json::{json_request, json_response}, +}; +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::*; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the @@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async; // Helper function used by the macro. 
(A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] -pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { +pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); @@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> { scenario } -pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { +pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { if actions == "exit" { fail::cfg_callback(name, exit_failpoint) } else { @@ -84,3 +95,45 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. 
+pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because storage was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 49e290dab8..46eadee1da 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -152,3 +152,16 @@ impl Debug for Generation { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn generation_gt() { + // Important that a None generation compares less than a valid one, during upgrades from + // pre-generation systems. 
+ assert!(Generation::none() < Generation::new(0)); + assert!(Generation::none() < Generation::new(1)); + } +} diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 70e682cb76..7ca62561fe 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( if body.remaining() == 0 { return Ok(None); } - serde_json::from_reader(body.reader()) - .context("Failed to parse json request") + + let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); + + serde_path_to_error::deserialize(&mut deser) + // intentionally stringify because the debug version is not helpful in python logs + .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) .map(Some) .map_err(ApiError::BadRequest) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bb6c848bf4..9e9b0adfe5 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -83,6 +83,8 @@ pub mod timeout; pub mod sync; +pub mod failpoint_support; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2f09c2f3ea..f7b73dc984 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use anyhow::Context; +use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; @@ -24,16 +25,48 @@ impl LogFormat { } } -static TRACING_EVENT_COUNT: Lazy = Lazy::new(|| { - metrics::register_int_counter_vec!( +struct TracingEventCountMetric { + error: IntCounter, + warn: IntCounter, + info: IntCounter, + debug: IntCounter, + trace: IntCounter, +} + +static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { + let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", &["level"] ) 
- .expect("failed to define metric") + .expect("failed to define metric"); + TracingEventCountMetric::new(vec) }); -struct TracingEventCountLayer(&'static metrics::IntCounterVec); +impl TracingEventCountMetric { + fn new(vec: IntCounterVec) -> Self { + Self { + error: vec.with_label_values(&["error"]), + warn: vec.with_label_values(&["warn"]), + info: vec.with_label_values(&["info"]), + debug: vec.with_label_values(&["debug"]), + trace: vec.with_label_values(&["trace"]), + } + } + + fn inc_for_level(&self, level: tracing::Level) { + let counter = match level { + tracing::Level::ERROR => &self.error, + tracing::Level::WARN => &self.warn, + tracing::Level::INFO => &self.info, + tracing::Level::DEBUG => &self.debug, + tracing::Level::TRACE => &self.trace, + }; + counter.inc(); + } +} + +struct TracingEventCountLayer(&'static TracingEventCountMetric); impl tracing_subscriber::layer::Layer for TracingEventCountLayer where @@ -44,15 +77,7 @@ where event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>, ) { - let level = event.metadata().level(); - let level = match *level { - tracing::Level::ERROR => "error", - tracing::Level::WARN => "warn", - tracing::Level::INFO => "info", - tracing::Level::DEBUG => "debug", - tracing::Level::TRACE => "trace", - }; - self.0.with_label_values(&[level]).inc(); + self.0.inc_for_level(*event.metadata().level()); } } @@ -106,7 +131,9 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); - let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + let r = r.with( + TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), + ); match tracing_error_layer_enablement { TracingErrorLayerEnablement::EnableWithRustLogFilter => r .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) @@ -257,14 +284,14 @@ impl std::fmt::Debug for SecretString { mod tests { use metrics::{core::Opts, IntCounterVec}; - use 
super::TracingEventCountLayer; + use crate::logging::{TracingEventCountLayer, TracingEventCountMetric}; #[test] fn tracing_event_count_metric() { let counter_vec = IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); - let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static - let layer = TracingEventCountLayer(counter_vec); + let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone()))); + let layer = TracingEventCountLayer(metric); use tracing_subscriber::prelude::*; tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 262dcb8a8a..b3269ae049 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -366,6 +366,49 @@ impl MonotonicCounter for RecordLsn { } } +/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// +/// This is used by the `pagebench` pageserver benchmarking tool. +pub struct LsnSampler(::Sampler); + +impl rand::distributions::uniform::SampleUniform for Lsn { + type Sampler = LsnSampler; +} + +impl rand::distributions::uniform::UniformSampler for LsnSampler { + type X = Lsn; + + fn new(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn new_inclusive(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn sample(&self, rng: &mut R) -> Self::X { + Lsn(self.0.sample(rng)) + } +} + #[cfg(test)] mod tests { use crate::bin_ser::BeSer; diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 177a839d75..dc4a599111 100644 --- a/libs/utils/src/simple_rcu.rs +++ 
b/libs/utils/src/simple_rcu.rs @@ -1,10 +1,10 @@ //! //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat //! similar to a lock, but it allows readers to "hold on" to an old value of RCU -//! without blocking writers, and allows writing a new values without blocking -//! readers. When you update the new value, the new value is immediately visible +//! without blocking writers, and allows writing a new value without blocking +//! readers. When you update the value, the new value is immediately visible //! to new readers, but the update waits until all existing readers have -//! finishe, so that no one sees the old value anymore. +//! finished, so that on return, no one sees the old value anymore. //! //! This implementation isn't wait-free; it uses an RwLock that is held for a //! short duration when the value is read or updated. @@ -26,6 +26,7 @@ //! Increment the value by one, and wait for old readers to finish: //! //! ``` +//! # async fn dox() { //! # let rcu = utils::simple_rcu::Rcu::new(1); //! let write_guard = rcu.lock_for_write(); //! @@ -36,15 +37,17 @@ //! //! // Concurrent reads and writes are now possible again. Wait for all the readers //! // that still observe the old value to finish. -//! waitlist.wait(); +//! waitlist.wait().await; +//! # } //! ``` //! #![warn(missing_docs)] use std::ops::Deref; -use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; use std::sync::{Arc, Weak}; -use std::sync::{Mutex, RwLock, RwLockWriteGuard}; +use std::sync::{RwLock, RwLockWriteGuard}; + +use tokio::sync::watch; /// /// Rcu allows multiple readers to read and hold onto a value without blocking @@ -68,22 +71,21 @@ struct RcuCell { value: V, /// A dummy channel. We never send anything to this channel. The point is - /// that when the RcuCell is dropped, any cloned Senders will be notified + /// that when the RcuCell is dropped, any subscribed Receivers will be notified /// that the channel is closed. 
Updaters can use this to wait out until the /// RcuCell has been dropped, i.e. until the old value is no longer in use. /// - /// We never do anything with the receiver, we just need to hold onto it so - /// that the Senders will be notified when it's dropped. But because it's - /// not Sync, we need a Mutex on it. - watch: (SyncSender<()>, Mutex>), + /// We never send anything to this, we just need to hold onto it so that the + /// Receivers will be notified when it's dropped. + watch: watch::Sender<()>, } impl RcuCell { fn new(value: V) -> Self { - let (watch_sender, watch_receiver) = sync_channel(0); + let (watch_sender, _) = watch::channel(()); RcuCell { value, - watch: (watch_sender, Mutex::new(watch_receiver)), + watch: watch_sender, } } } @@ -141,10 +143,10 @@ impl Deref for RcuReadGuard { /// /// Write guard returned by `write` /// -/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so -/// it should only be held for a short duration! +/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be +/// held for a short duration! /// -/// Calling `store` consumes the guard, making new reads and new writes possible +/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible /// again. /// pub struct RcuWriteGuard<'a, V> { @@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> { // the watches for any that do. self.inner.old_cells.retain(|weak| { if let Some(cell) = weak.upgrade() { - watches.push(cell.watch.0.clone()); + watches.push(cell.watch.subscribe()); true } else { false @@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> { /// /// List of readers who can still see old values. /// -pub struct RcuWaitList(Vec>); +pub struct RcuWaitList(Vec>); impl RcuWaitList { /// /// Wait for old readers to finish. 
/// - pub fn wait(mut self) { + pub async fn wait(mut self) { // after all the old_cells are no longer in use, we're done for w in self.0.iter_mut() { // This will block until the Receiver is closed. That happens when // the RcuCell is dropped. #[allow(clippy::single_match)] - match w.send(()) { - Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"), + match w.changed().await { + Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"), Err(_) => { // closed, which means that the cell has been dropped, and // its value is no longer in use @@ -220,11 +222,10 @@ impl RcuWaitList { mod tests { use super::*; use std::sync::{Arc, Mutex}; - use std::thread::{sleep, spawn}; use std::time::Duration; - #[test] - fn two_writers() { + #[tokio::test] + async fn two_writers() { let rcu = Rcu::new(1); let read1 = rcu.read(); @@ -248,33 +249,35 @@ mod tests { assert_eq!(*read1, 1); let log = Arc::new(Mutex::new(Vec::new())); - // Wait for the old readers to finish in separate threads. + // Wait for the old readers to finish in separate tasks. let log_clone = Arc::clone(&log); - let thread2 = spawn(move || { - wait2.wait(); + let task2 = tokio::spawn(async move { + wait2.wait().await; log_clone.lock().unwrap().push("wait2 done"); }); let log_clone = Arc::clone(&log); - let thread3 = spawn(move || { - wait3.wait(); + let task3 = tokio::spawn(async move { + wait3.wait().await; log_clone.lock().unwrap().push("wait3 done"); }); // without this sleep the test can pass on accident if the writer is slow - sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(100)).await; // Release first reader. This allows first write to finish, but calling - // wait() on the second one would still block. + // wait() on the 'task3' would still block. 
log.lock().unwrap().push("dropping read1"); drop(read1); - thread2.join().unwrap(); + task2.await.unwrap(); - sleep(Duration::from_millis(500)); + assert!(!task3.is_finished()); + + tokio::time::sleep(Duration::from_millis(100)).await; // Release second reader, and finish second writer. log.lock().unwrap().push("dropping read2"); drop(read2); - thread3.join().unwrap(); + task3.await.unwrap(); assert_eq!( log.lock().unwrap().as_slice(), diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 9aad0af22d..31c76d2f74 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -30,18 +30,32 @@ async fn warn_if_stuck( let mut fut = std::pin::pin!(fut); - loop { + let mut warned = false; + let ret = loop { match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => return ret, + Ok(ret) => break ret, Err(_) => { tracing::warn!( gate = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." ); + warned = true; } } + }; + + // If we emitted a warning for slowness, also emit a message when we complete, so that + // someone debugging a shutdown can know for sure whether we have moved past this operation. 
+ if warned { + tracing::info!( + gate = name, + elapsed_ms = started.elapsed().as_millis(), + "completed, after taking longer than expected" + ) } + + ret } #[derive(Debug)] diff --git a/libs/utils/src/timeout.rs b/libs/utils/src/timeout.rs index 11fa417242..56bf57a900 100644 --- a/libs/utils/src/timeout.rs +++ b/libs/utils/src/timeout.rs @@ -2,8 +2,11 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; +#[derive(thiserror::Error, Debug)] pub enum TimeoutCancellableError { + #[error("Timed out")] Timeout, + #[error("Cancelled")] Cancelled, } diff --git a/libs/walproposer/bindgen_deps.h b/libs/walproposer/bindgen_deps.h index b95788347c..41ee1cd4a3 100644 --- a/libs/walproposer/bindgen_deps.h +++ b/libs/walproposer/bindgen_deps.h @@ -1 +1,2 @@ +#include "postgres.h" #include "walproposer.h" diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index d32c8ab299..fd09030dbd 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,3 +1,6 @@ +//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h +//! to generate Rust bindings for it. + use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 7f1bbc3b80..1f7bf952dc 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -1,3 +1,6 @@ +//! A C-Rust shim: defines implementation of C walproposer API, assuming wp +//! callback_data stores Box to some Rust implementation. 
+ #![allow(dead_code)] use std::ffi::CStr; @@ -5,12 +8,12 @@ use std::ffi::CString; use crate::bindings::uint32; use crate::bindings::walproposer_api; +use crate::bindings::NeonWALReadResult; use crate::bindings::PGAsyncReadResult; use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; use crate::bindings::Size; use crate::bindings::StringInfoData; -use crate::bindings::TimeLineID; use crate::bindings::TimestampTz; use crate::bindings::WalProposer; use crate::bindings::WalProposerConnStatusType; @@ -175,31 +178,11 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download( - sk: *mut Safekeeper, - _timeline: TimeLineID, - startpos: XLogRecPtr, - endpos: XLogRecPtr, -) -> bool { +extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).recovery_download(&mut (*sk), startpos, endpos) - } -} - -#[allow(clippy::unnecessary_cast)] -extern "C" fn wal_read( - sk: *mut Safekeeper, - buf: *mut ::std::os::raw::c_char, - startptr: XLogRecPtr, - count: Size, -) { - unsafe { - let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); - let callback_data = (*(*(*sk).wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).wal_read(&mut (*sk), buf, startptr) + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -211,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { } } -extern "C" fn free_event_set(wp: *mut WalProposer) { +#[allow(clippy::unnecessary_cast)] +extern "C" fn wal_read( + sk: *mut Safekeeper, + buf: *mut ::std::os::raw::c_char, + startptr: XLogRecPtr, + count: Size, + _errmsg: *mut *mut ::std::os::raw::c_char, +) -> NeonWALReadResult { unsafe { - let callback_data = (*(*wp).config).callback_data; + let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); + let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as 
*mut Box; - (*api).free_event_set(&mut (*wp)); + // TODO: errmsg is not forwarded + (*api).wal_read(&mut (*sk), buf, startptr) + } +} + +extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).wal_reader_events(&mut (*sk)) } } @@ -235,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).active_state_update_event_set(&mut (*sk)); + } +} + extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -243,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).rm_safekeeper_event_set(&mut (*sk)); + } +} + extern "C" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, @@ -310,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog } } -extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).confirm_wal_streamed(&mut (*wp), lsn) - } -} - extern "C" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, @@ -332,14 +340,6 @@ extern "C" fn log_internal( } } -extern "C" fn after_election(wp: *mut WalProposer) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).after_election(&mut (*wp)) - } -} - #[derive(Debug)] pub enum Level { Debug5, @@ -398,20 +398,20 @@ pub(crate) fn 
create_api() -> walproposer_api { conn_async_write: Some(conn_async_write), conn_blocking_write: Some(conn_blocking_write), recovery_download: Some(recovery_download), - wal_read: Some(wal_read), wal_reader_allocate: Some(wal_reader_allocate), - free_event_set: Some(free_event_set), + wal_read: Some(wal_read), + wal_reader_events: Some(wal_reader_events), init_event_set: Some(init_event_set), update_event_set: Some(update_event_set), + active_state_update_event_set: Some(active_state_update_event_set), add_safekeeper_event_set: Some(add_safekeeper_event_set), + rm_safekeeper_event_set: Some(rm_safekeeper_event_set), wait_event_set: Some(wait_event_set), strong_random: Some(strong_random), get_redo_start_lsn: Some(get_redo_start_lsn), finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), - confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), - after_election: Some(after_election), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 0661d3a969..7251545792 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -6,8 +6,8 @@ use utils::id::TenantTimelineId; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree, - WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, + WalProposerFree, WalProposerStart, }, }; @@ -86,19 +86,19 @@ pub trait ApiImpl { todo!() } - fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool { + fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool { todo!() } - fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) { + fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult { todo!() } - fn wal_reader_allocate(&self, _sk: &mut 
Safekeeper) { + fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult { todo!() } - fn free_event_set(&self, _wp: &mut WalProposer) { + fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 { todo!() } @@ -110,10 +110,18 @@ pub trait ApiImpl { todo!() } + fn active_state_update_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) { todo!() } + fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult { todo!() } @@ -134,10 +142,6 @@ pub trait ApiImpl { todo!() } - fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) { - todo!() - } - fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) { todo!() } @@ -240,6 +244,7 @@ impl Drop for Wrapper { #[cfg(test)] mod tests { + use core::panic; use std::{ cell::Cell, sync::{atomic::AtomicUsize, mpsc::sync_channel}, @@ -247,7 +252,7 @@ mod tests { use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, walproposer::Wrapper}; + use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; use super::ApiImpl; @@ -355,12 +360,17 @@ mod tests { true } - fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) { - println!("wal_reader_allocate") + fn recovery_download( + &self, + _wp: &mut crate::bindings::WalProposer, + _sk: &mut crate::bindings::Safekeeper, + ) -> bool { + true } - fn free_event_set(&self, _: &mut crate::bindings::WalProposer) { - println!("free_event_set") + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { + println!("wal_reader_allocate"); + crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { @@ -383,6 +393,13 @@ mod tests { self.wait_events.set(WaitEventsData { sk, event_mask }); } + fn 
rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { + println!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut crate::bindings::Safekeeper + ); + } + fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, @@ -408,7 +425,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("walprop_log[{}] {}", level, msg); + println!("wp_log[{}] {}", level, msg); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { @@ -436,9 +453,9 @@ mod tests { event_mask: 0, }), expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, @@ -478,7 +495,7 @@ mod tests { // walproposer will panic when it finishes sync_safekeepers std::panic::catch_unwind(|| wp.start()).unwrap_err(); // validate the resulting LSN - assert_eq!(receiver.recv()?, 1337); + assert_eq!(receiver.try_recv(), Ok(1337)); Ok(()) // drop() will free up resources here } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 35c260740c..980fbab22e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,6 +36,7 
@@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true +md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses num_cpus = { version = "1.15" } @@ -62,6 +63,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ba41866935..4837626086 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -13,6 +13,7 @@ use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; +use pageserver_api::shard::TenantShardId; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) { let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml new file mode 100644 index 0000000000..0ed27602cd --- /dev/null +++ b/pageserver/client/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "pageserver_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true 
+serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs new file mode 100644 index 0000000000..4a3f4dea47 --- /dev/null +++ b/pageserver/client/src/lib.rs @@ -0,0 +1,2 @@ +pub mod mgmt_api; +pub mod page_service; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs new file mode 100644 index 0000000000..87e4ed8efd --- /dev/null +++ b/pageserver/client/src/mgmt_api.rs @@ -0,0 +1,202 @@ +use pageserver_api::models::*; +use reqwest::{IntoUrl, Method}; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, +}; + +pub mod util; + +#[derive(Debug)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + #[error("pageserver API: {0}")] + ApiError(String), +} + +pub type Result = std::result::Result; + +#[async_trait::async_trait] +pub trait ResponseErrorMessageExt: Sized { + async fn error_from_body(self) -> Result; +} + +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(mut self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: 
Option<&str>) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), + client: reqwest::Client::new(), + } + } + + pub async fn list_tenants(&self) -> Result> { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn tenant_details( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn list_timelines( + &self, + tenant_id: TenantId, + ) -> Result> { + let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn timeline_info( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn keyspace( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn status(&self) -> Result<()> { + let uri = format!("{}/v1/status", self.mgmt_api_endpoint); + self.get(&uri).await?; + Ok(()) + } + + pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { + let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); + self.request(Method::PUT, &uri, req).await?; + Ok(()) + } + + pub async fn location_config( + &self, + tenant_id: TenantId, + config: LocationConfig, + flush_ms: Option, + ) -> Result<()> { + let req_body = TenantLocationConfigRequest { tenant_id, config }; + let path = format!( + "{}/v1/tenant/{}/location_config", + self.mgmt_api_endpoint, tenant_id + ); + let path = if let Some(flush_ms) = flush_ms { + format!("{}?flush_ms={}", path, flush_ms.as_millis()) + } else { + path + }; + self.request(Method::PUT, &path, &req_body).await?; + Ok(()) + } + + pub async fn timeline_create( + &self, + tenant_id: TenantId, + req: &TimelineCreateRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, req) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } +} diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs new file mode 100644 index 0000000000..048a3bb7cd --- /dev/null +++ b/pageserver/client/src/mgmt_api/util.rs @@ -0,0 +1,49 @@ +//! Helpers to do common higher-level tasks with the [`Client`]. + +use std::sync::Arc; + +use tokio::task::JoinSet; +use utils::id::{TenantId, TenantTimelineId}; + +use super::Client; + +/// Retrieve a list of all of the pageserver's timelines. +/// +/// Fails if there are sharded tenants present on the pageserver. +pub async fn get_pageserver_tenant_timelines_unsharded( + api_client: &Arc, +) -> anyhow::Result> { + let mut timelines: Vec = Vec::new(); + let mut tenants: Vec = Vec::new(); + for ti in api_client.list_tenants().await? { + if !ti.id.is_unsharded() { + anyhow::bail!( + "only unsharded tenants are supported at this time: {}", + ti.id + ); + } + tenants.push(ti.id.tenant_id) + } + let mut js = JoinSet::new(); + for tenant_id in tenants { + js.spawn({ + let mgmt_api_client = Arc::clone(api_client); + async move { + ( + tenant_id, + mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + ) + } + }); + } + while let Some(res) = js.join_next().await { + let (tenant_id, details) = res.unwrap(); + for timeline_id in details.timelines { + timelines.push(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + } + Ok(timelines) +} diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs new file mode 100644 index 0000000000..231461267a --- /dev/null +++ b/pageserver/client/src/page_service.rs @@ -0,0 +1,144 @@ +use std::pin::Pin; + +use futures::SinkExt; +use pageserver_api::{ + models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, + PagestreamGetPageResponse, + }, + reltag::RelTag, +}; +use tokio::task::JoinHandle; +use tokio_postgres::CopyOutStream; +use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; 
+use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +pub struct Client { + client: tokio_postgres::Client, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct BasebackupRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub lsn: Option, + pub gzip: bool, +} + +impl Client { + pub async fn new(connstring: String) -> anyhow::Result { + let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + + let conn_task_cancel = CancellationToken::new(); + let conn_task = tokio::spawn({ + let conn_task_cancel = conn_task_cancel.clone(); + async move { + tokio::select! { + _ = conn_task_cancel.cancelled() => { } + res = connection => { + res.unwrap(); + } + } + } + }); + Ok(Self { + cancel_on_client_drop: Some(conn_task_cancel.drop_guard()), + conn_task, + client, + }) + } + + pub async fn pagestream( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + let copy_both: tokio_postgres::CopyBothDuplex = self + .client + .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .await?; + let Client { + cancel_on_client_drop, + conn_task, + client: _, + } = self; + Ok(PagestreamClient { + copy_both: Box::pin(copy_both), + conn_task, + cancel_on_client_drop, + }) + } + + pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result { + let BasebackupRequest { + tenant_id, + timeline_id, + lsn, + gzip, + } = req; + let mut args = Vec::with_capacity(5); + args.push("basebackup".to_string()); + args.push(format!("{tenant_id}")); + args.push(format!("{timeline_id}")); + if let Some(lsn) = lsn { + args.push(format!("{lsn}")); + } + if *gzip { + args.push("--gzip".to_string()) + } + Ok(self.client.copy_out(&args.join(" ")).await?) + } +} + +/// Create using [`Client::pagestream`]. 
+pub struct PagestreamClient { + copy_both: Pin>>, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct RelTagBlockNo { + pub rel_tag: RelTag, + pub block_no: u32, +} + +impl PagestreamClient { + pub async fn shutdown(mut self) { + let _ = self.cancel_on_client_drop.take(); + self.conn_task.await.unwrap(); + } + + pub async fn getpage( + &mut self, + req: PagestreamGetPageRequest, + ) -> anyhow::Result { + let req = PagestreamFeMessage::GetPage(req); + let req: bytes::Bytes = req.serialize(); + // let mut req = tokio_util::io::ReaderStream::new(&req); + let mut req = tokio_stream::once(Ok(req)); + + self.copy_both.send_all(&mut req).await?; + + let next: Option> = self.copy_both.next().await; + let next: bytes::Bytes = next.unwrap()?; + + let msg = PagestreamBeMessage::deserialize(next)?; + match msg { + PagestreamBeMessage::GetPage(p) => Ok(p), + PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), + PagestreamBeMessage::Exists(_) + | PagestreamBeMessage::Nblocks(_) + | PagestreamBeMessage::DbSize(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + msg.kind() + ) + } + } + } +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml new file mode 100644 index 0000000000..169d9b7f8e --- /dev/null +++ b/pageserver/pagebench/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pagebench" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures.workspace = true +hdrhistogram.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +rand.workspace = true +serde.workspace = true +serde_json.workspace = true +tracing.workspace = true +tokio.workspace = true + +pageserver = { path = ".." 
} +pageserver_client.workspace = true +pageserver_api.workspace = true +utils = { path = "../../libs/utils/" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs new file mode 100644 index 0000000000..85a3e695de --- /dev/null +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -0,0 +1,272 @@ +use anyhow::Context; +use pageserver_client::page_service::BasebackupRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{debug, info, instrument}; + +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// basebackup@LatestLSN +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long, default_value = "1.0")] + gzip_probability: f64, + #[clap(long)] + runtime: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +struct Target { + timeline: TenantTimelineId, + lsn_range: Option>, +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> 
{ + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let timeline = *timeline; + // FIXME: this triggers initial logical size calculation + // https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + async move { + anyhow::Ok(Target { + timeline, + // TODO: support lsn_range != latest LSN + lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + }) + } + }); + } + let mut all_targets: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_targets.push(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let 
completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender = async move { + start_work_barrier.wait().await; + loop { + let (timeline, work) = { + let mut rng = rand::thread_rng(); + let target = all_targets.choose(&mut rng).unwrap(); + let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + ( + target.timeline, + Work { + lsn, + gzip: rng.gen_bool(args.gzip_probability), + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? 
+ sender.send(work).await.ok().unwrap(); + } + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[derive(Copy, Clone)] +struct Work { + lsn: Option, + gzip: bool, +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( + &args.page_service_host_port, + args.pageserver_jwt.as_deref(), + )) + .await + .unwrap(); + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + let copy_out_stream = client + .basebackup(&BasebackupRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + lsn, + gzip, + }) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap(); + + use futures::StreamExt; + let size = Arc::new(AtomicUsize::new(0)); + copy_out_stream + .for_each({ + |r| { + let size = Arc::clone(&size); + async move { + let size = Arc::clone(&size); + size.fetch_add(r.unwrap().len(), Ordering::Relaxed); + } + } + }) + .await; + debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + 
let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs new file mode 100644 index 0000000000..cb36a403f1 --- /dev/null +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -0,0 +1,351 @@ +use anyhow::Context; +use futures::future::join_all; +use pageserver::pgdatadir_mapping::key_to_rel_block; +use pageserver::repository; +use pageserver_api::key::is_rel_block_key; +use pageserver_api::models::PagestreamGetPageRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; + +use std::collections::HashMap; +use std::future::Future; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long)] + runtime: Option, + #[clap(long)] + per_target_rate_limit: Option, + /// Probability for sending `latest=true` in the request (uniform distribution). 
+ #[clap(long, default_value = "1")] + req_latest_probability: f64, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +#[derive(Clone)] +struct KeyRange { + timeline: TenantTimelineId, + timeline_lsn: Lsn, + start: i128, + end: i128, +} + +impl KeyRange { + fn len(&self) -> i128 { + self.end - self.start + } +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace(timeline.tenant_id, timeline.timeline_id) + .await?; + let lsn = partitioning.at_lsn; + + let ranges = partitioning + .keys + .ranges + .iter() + .filter_map(|r| { + let start = r.start; + let end = r.end; + // filter out non-relblock keys + match (is_rel_block_key(&start), is_rel_block_key(&end)) { + (true, true) => Some(KeyRange { + timeline, + 
timeline_lsn: lsn, + start: start.to_i128(), + end: end.to_i128(), + }), + (true, false) | (false, true) => { + unimplemented!("split up range") + } + (false, false) => None, + } + }) + .collect::>(); + + anyhow::Ok(ranges) + } + }); + } + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_ranges.extend(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender: Pin>> = match args.per_target_rate_limit { + None => Box::pin(async move { + let weights = rand::distributions::weighted::WeightedIndex::new( + all_ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + start_work_barrier.wait().await; + + loop { + let (timeline, req) = { + let mut rng = rand::thread_rng(); + let r = 
&all_ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + ( + r.timeline, + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? + sender.send(req).await.ok().unwrap(); + } + }), + Some(rps_limit) => Box::pin(async move { + let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); + + let make_timeline_task: &dyn Fn( + TenantTimelineId, + ) + -> Pin>> = &|timeline| { + let sender = work_senders.get(&timeline).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + loop { + ticker.tick().await; + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } + }; + sender.send(req).await.ok().unwrap(); + } + }) + }; + + let tasks: Vec<_> = work_senders + .keys() + .map(|tl| make_timeline_task(**tl)) + .collect(); + + start_work_barrier.wait().await; + + join_all(tasks).await; + }), + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), 
work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + + while let Some(req) = work.recv().await { + let start = Instant::now(); + client + .getpage(req) + .await + .with_context(|| format!("getpage for {timeline}")) + .unwrap(); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs new file mode 100644 index 0000000000..d46ae94e8a --- /dev/null +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use humantime::Duration; +use tokio::task::JoinSet; +use utils::id::TenantTimelineId; + +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, 
default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap( + long, + help = "if specified, poll mgmt api to check whether init logical size calculation has completed" + )] + poll_for_completion: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + // kick it off + + let mut js = JoinSet::new(); + for tl in timelines { + let mgmt_api_client = Arc::clone(&mgmt_api_client); + js.spawn(async move { + // TODO: API to explicitly trigger initial logical size computation. + // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation. 
+ // => https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + + if let Some(period) = args.poll_for_completion { + let mut ticker = tokio::time::interval(period.into()); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let mut info = info; + while !info.current_logical_size_is_accurate { + ticker.tick().await; + info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + } + } + }); + } + while let Some(res) = js.join_next().await { + let _: () = res.unwrap(); + } + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs new file mode 100644 index 0000000000..e0120c9212 --- /dev/null +++ b/pageserver/pagebench/src/main.rs @@ -0,0 +1,48 @@ +use clap::Parser; +use utils::logging; + +/// Re-usable pieces of code that aren't CLI-specific. +mod util { + pub(crate) mod connstring; + pub(crate) mod request_stats; + #[macro_use] + pub(crate) mod tokio_thread_local_stats; + /// Re-usable pieces of CLI-specific code. + pub(crate) mod cli { + pub(crate) mod targets; + } +} + +/// The pagebench CLI sub-commands, dispatched in [`main`] below. +mod cmd { + pub(super) mod basebackup; + pub(super) mod getpage_latest_lsn; + pub(super) mod trigger_initial_size_calculation; +} + +/// Component-level performance test for pageserver. 
+#[derive(clap::Parser)] +enum Args { + Basebackup(cmd::basebackup::Args), + GetPageLatestLsn(cmd::getpage_latest_lsn::Args), + TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), +} + +fn main() { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stderr, + ) + .unwrap(); + + let args = Args::parse(); + match args { + Args::Basebackup(args) => cmd::basebackup::main(args), + Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Args::TriggerInitialSizeCalculation(args) => { + cmd::trigger_initial_size_calculation::main(args) + } + } + .unwrap() +} diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs new file mode 100644 index 0000000000..848eae27cf --- /dev/null +++ b/pageserver/pagebench/src/util/cli/targets.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use pageserver_client::mgmt_api; +use tracing::info; +use utils::id::TenantTimelineId; + +pub(crate) struct Spec { + pub(crate) limit_to_first_n_targets: Option, + pub(crate) targets: Option>, +} + +pub(crate) async fn discover( + api_client: &Arc, + spec: Spec, +) -> anyhow::Result> { + let mut timelines = if let Some(targets) = spec.targets { + targets + } else { + mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await? 
+    };
+
+    if let Some(limit) = spec.limit_to_first_n_targets {
+        timelines.sort(); // for determinism
+        timelines.truncate(limit);
+        if timelines.len() < limit {
+            anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
+        }
+    }
+
+    info!("timelines:\n{:?}", timelines);
+    info!("number of timelines:\n{:?}", timelines.len());
+
+    Ok(timelines)
+}
diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs
new file mode 100644
index 0000000000..07a0ff042d
--- /dev/null
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -0,0 +1,10 @@
+/// Builds a `postgres://` connection string for user `postgres` at `host_port`;
+/// `jwt`, if given, is placed after the colon in the userinfo part, unescaped.
+pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
+    let colon_and_jwt = if let Some(jwt) = jwt {
+        format!(":{jwt}") // TODO: urlescape
+    } else {
+        String::new()
+    };
+    format!("postgres://postgres{colon_and_jwt}@{host_port}")
+}
diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs
new file mode 100644
index 0000000000..5ecf1cbf24
--- /dev/null
+++ b/pageserver/pagebench/src/util/request_stats.rs
@@ -0,0 +1,100 @@
+use std::time::Duration;
+
+use anyhow::Context;
+
+/// Latency statistics of completed requests, kept in a histogram of microseconds.
+pub(crate) struct Stats {
+    latency_histo: hdrhistogram::Histogram<u64>,
+}
+
+impl Stats {
+    /// Creates an empty `Stats` whose histogram is bounded to 1..=1_000_000_000 microseconds.
+    pub(crate) fn new() -> Self {
+        Self {
+            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
+            // which would skew the benchmark results.
+            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
+        }
+    }
+
+    /// Records one request latency, in whole microseconds.
+    ///
+    /// Returns an error if the value does not fit into `u64` or the histogram refuses it.
+    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
+        let micros: u64 = latency
+            .as_micros()
+            .try_into()
+            .context("latency greater than u64")?;
+        self.latency_histo
+            .record(micros)
+            .context("add to histogram")?;
+        Ok(())
+    }
+
+    /// Summarizes the recorded latencies: request count, mean, and every `LATENCY_PERCENTILES` entry.
+    pub(crate) fn output(&self) -> Output {
+        let latency_percentiles = std::array::from_fn(|idx| {
+            let micros = self
+                .latency_histo
+                .value_at_percentile(LATENCY_PERCENTILES[idx]);
+            Duration::from_micros(micros)
+        });
+        Output {
+            request_count: self.latency_histo.len(),
+            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
+            latency_percentiles: LatencyPercentiles {
+                latency_percentiles,
+            },
+        }
+    }
+
+    /// Merges the observations of `other` into `self`; panics if the histograms cannot be added.
+    pub(crate) fn add(&mut self, other: &Self) {
+        let Self {
+            ref mut latency_histo,
+        } = self;
+        latency_histo.add(&other.latency_histo).unwrap();
+    }
+}
+
+impl Default for Stats {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Percentiles reported in `Output`, in the order they are stored in `LatencyPercentiles`.
+const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
+
+/// Latency at each of the `LATENCY_PERCENTILES`, index-aligned with that constant.
+struct LatencyPercentiles {
+    latency_percentiles: [Duration; 4],
+}
+
+impl serde::Serialize for LatencyPercentiles {
+    /// Serializes as a map from `p<percentile>` to the human-readable latency at that percentile.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeMap;
+        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
+        // Pair every percentile with its own latency; the two arrays are index-aligned.
+        for (p, latency) in LATENCY_PERCENTILES.iter().zip(self.latency_percentiles) {
+            ser.serialize_entry(
+                &format!("p{p}"),
+                &humantime::format_duration(latency).to_string(),
+            )?;
+        }
+        ser.end()
+    }
+}
+
+/// Serializable summary produced by `Stats::output`.
+#[derive(serde::Serialize)]
+pub(crate) struct Output {
+    request_count: u64,
+    #[serde(with = "humantime_serde")]
+    latency_mean: Duration,
+    latency_percentiles: LatencyPercentiles,
+}
diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
new file mode 100644
index 0000000000..82526213b6
--- /dev/null
+++ 
b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
@@ -0,0 +1,51 @@
+/// Stats of one runtime thread, shared between that thread and the aggregating code.
+pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
+/// Registry of the stats of every runtime thread started by `main!`.
+pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
+
+/// Declares the thread-local `$THREAD_LOCAL_NAME` holding a default-initialized stats value of type `$T`.
+macro_rules! declare {
+    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
+        thread_local! {
+            pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
+                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
+            );
+        }
+    };
+}
+
+use std::sync::{Arc, Mutex};
+
+pub(crate) use declare;
+
+/// Runs `$main_impl` on a multi-threaded tokio runtime, handing it the registry of all
+/// per-thread `$THREAD_LOCAL_NAME` stats, and returns what the spawned task returns.
+macro_rules! main {
+    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
+        let main_impl = $main_impl;
+        // Fully qualified paths, like in `declare!`, so call sites need not import `Arc`/`Mutex`.
+        let all = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .on_thread_start({
+                let all = std::sync::Arc::clone(&all);
+                move || {
+                    // pre-initialize the thread local stats by accessing them
+                    // (some stats like requests_stats::Stats are quite costly to initialize,
+                    // we don't want to pay that cost during the measurement period)
+                    $THREAD_LOCAL_NAME.with(|stats| {
+                        let stats: std::sync::Arc<_> = std::sync::Arc::clone(&*stats.borrow());
+                        all.lock().unwrap().push(stats);
+                    });
+                }
+            })
+            .enable_all()
+            .build()
+            .unwrap();
+
+        let main_task = rt.spawn(main_impl(all));
+        rt.block_on(main_task).unwrap()
+    }};
+}
+
+pub(crate) use main;
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed452eae7d..7e5ae892ad 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,6 +23,7 @@ use tracing::*; use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; +use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -174,7 +175,7 @@ where ] { for segno in self .timeline - .list_slru_segments(kind, self.lsn, self.ctx) + .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) .await? { self.add_slru_segment(kind, segno).await?; @@ -192,7 +193,7 @@ where // Otherwise only include init forks of unlogged relations.
let rels = self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -267,7 +268,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, self.lsn, false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) .await?; // If the relation is empty, create an empty file @@ -288,7 +289,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -310,7 +311,7 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn, self.ctx) + .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); @@ -352,7 +353,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) + .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; ensure!( @@ -399,7 +400,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await? 
.is_empty() { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 542c1b7b30..621ad050f4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; -use pageserver::tenant::TenantSharedResources; +use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; use tokio::time::Instant; use tracing::*; @@ -31,6 +31,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ @@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> { } // Initialize up failpoints support - let scenario = pageserver::failpoint_support::init(); + let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); @@ -402,15 +403,11 @@ fn start_pageserver( let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel(); let (init_done_tx, init_done_rx) = utils::completion::channel(); - let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel(); - let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); let order = pageserver::InitializationOrder { initial_tenant_load_remote: Some(init_done_tx), initial_tenant_load: Some(init_remote_done_tx), - initial_logical_size_can_start: init_done_rx.clone(), - initial_logical_size_attempt: Some(init_logical_size_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), }; @@ -429,7 +426,6 @@ fn start_pageserver( let tenant_manager = Arc::new(tenant_manager); 
BACKGROUND_RUNTIME.spawn({ - let init_done_rx = init_done_rx; let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { // NOTE: unlike many futures in pageserver, this one is cancellation-safe @@ -464,7 +460,7 @@ fn start_pageserver( }); let WaitForPhaseResult { - timeout_remaining: timeout, + timeout_remaining: _timeout, skipped: init_load_skipped, } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await; @@ -472,26 +468,6 @@ fn start_pageserver( scopeguard::ScopeGuard::into_inner(guard); - let guard = scopeguard::guard_on_success((), |_| { - tracing::info!("Cancelled before initial logical sizes completed") - }); - - let logical_sizes_done = std::pin::pin!(async { - init_logical_size_done_rx.wait().await; - startup_checkpoint( - started_startup_at, - "initial_logical_sizes", - "Initial logical sizes completed", - ); - }); - - let WaitForPhaseResult { - timeout_remaining: _, - skipped: logical_sizes_skipped, - } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await; - - scopeguard::ScopeGuard::into_inner(guard); - // allow background jobs to start: we either completed prior stages, or they reached timeout // and were skipped. It is important that we do not let them block background jobs indefinitely, // because things like consumption metrics for billing are blocked by this barrier. 
@@ -514,9 +490,6 @@ fn start_pageserver( if let Some(f) = init_load_skipped { f.await; } - if let Some(f) = logical_sizes_skipped { - f.await; - } scopeguard::ScopeGuard::into_inner(guard); startup_checkpoint(started_startup_at, "complete", "Startup complete"); @@ -532,6 +505,17 @@ fn start_pageserver( } }); + let secondary_controller = if let Some(remote_storage) = &remote_storage { + secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ) + } else { + secondary::null_controller() + }; + // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has @@ -561,6 +545,7 @@ fn start_pageserver( broker_client.clone(), disk_usage_eviction_state, deletion_queue.new_client(), + secondary_controller, ) .context("Failed to initialize router state")?, ); @@ -587,7 +572,6 @@ fn start_pageserver( } if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let background_jobs_barrier = background_jobs_barrier; let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 737495d414..4560f5eca0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -41,6 +41,8 @@ use crate::{ TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; +use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; @@ -61,6 +63,8 @@ pub mod defaults { pub const DEFAULT_LOG_FORMAT: &str = "plain"; + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); @@ -70,6 +74,10 @@ pub mod defaults { pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + /// /// Default built-in configuration file. /// @@ -82,6 +90,7 @@ pub mod defaults { #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' +#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} # initial superuser role name to use when creating a new tenant @@ -92,6 +101,7 @@ pub mod defaults { #log_format = '{DEFAULT_LOG_FORMAT}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' @@ -101,6 +111,8 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' +#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ 
-117,6 +129,8 @@ pub mod defaults { #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #gc_feedback = false +#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} + [remote_storage] "# @@ -176,6 +190,11 @@ pub struct PageServerConf { pub log_format: LogFormat, + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, + /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes + /// loading such tenants, vs. other work in the system. + pub concurrent_tenant_warmup: ConfigurableSemaphore, + /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. @@ -215,6 +234,13 @@ pub struct PageServerConf { /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. pub control_plane_emergency_mode: bool, + + /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize + /// heatmap uploads vs. other remote storage operations. 
+ pub heatmap_upload_concurrency: usize, + + /// Maximum number of WAL records to be ingested and committed at the same time + pub ingest_batch_size: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -275,6 +301,7 @@ struct PageServerConfigBuilder { log_format: BuilderValue, + concurrent_tenant_warmup: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, @@ -293,6 +320,10 @@ struct PageServerConfigBuilder { control_plane_api: BuilderValue>, control_plane_api_token: BuilderValue>, control_plane_emergency_mode: BuilderValue, + + heatmap_upload_concurrency: BuilderValue, + + ingest_batch_size: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -330,6 +361,8 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), concurrent_tenant_size_logical_size_queries: Set( ConfigurableSemaphore::DEFAULT_INITIAL, ), @@ -361,6 +394,10 @@ impl Default for PageServerConfigBuilder { control_plane_api: Set(None), control_plane_api_token: Set(None), control_plane_emergency_mode: Set(false), + + heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + + ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), } } } @@ -441,6 +478,10 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } + pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { + self.concurrent_tenant_warmup = BuilderValue::Set(u); + } + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } @@ -501,7 +542,18 @@ impl PageServerConfigBuilder { self.control_plane_emergency_mode = BuilderValue::Set(enabled) } + pub fn 
heatmap_upload_concurrency(&mut self, value: usize) { + self.heatmap_upload_concurrency = BuilderValue::Set(value) + } + + pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { + self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) + } + pub fn build(self) -> anyhow::Result { + let concurrent_tenant_warmup = self + .concurrent_tenant_warmup + .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries .ok_or(anyhow!( @@ -554,6 +606,7 @@ impl PageServerConfigBuilder { .broker_keepalive_interval .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( concurrent_tenant_size_logical_size_queries, ), @@ -595,6 +648,12 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, + heatmap_upload_concurrency: self + .heatmap_upload_concurrency + .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, + ingest_batch_size: self + .ingest_batch_size + .ok_or(anyhow!("missing ingest_batch_size"))?, }) } } @@ -787,6 +846,11 @@ impl PageServerConf { "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? ), + "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ + let input = parse_toml_string(key, item)?; + let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; + NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
+ }), "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ let input = parse_toml_string(key, item)?; let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; @@ -828,8 +892,11 @@ impl PageServerConf { }, "control_plane_emergency_mode" => { builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, + "heatmap_upload_concurrency" => { + builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) + }, + "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -855,7 +922,8 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { - Utf8PathBuf::from(format!("../tmp_check/test_{test_name}")) + let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); + Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { @@ -881,6 +949,10 @@ impl PageServerConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant"), + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), @@ -895,6 +967,8 @@ impl PageServerConf { control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, } } } @@ -1098,6 +1172,9 @@ background_task_maximum_delay = '334 s' storage_broker::DEFAULT_KEEPALIVE_INTERVAL )?, log_format: 
LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), @@ -1119,7 +1196,9 @@ background_task_maximum_delay = '334 s' )?, control_plane_api: None, control_plane_api_token: None, - control_plane_emergency_mode: false + control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, }, "Correct defaults should be used when no config values are provided" ); @@ -1163,6 +1242,9 @@ background_task_maximum_delay = '334 s' broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), @@ -1176,7 +1258,9 @@ background_task_maximum_delay = '334 s' background_task_maximum_delay: Duration::from_secs(334), control_plane_api: None, control_plane_api_token: None, - control_plane_emergency_mode: false + control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + ingest_batch_size: 100, }, "Should be able to parse all basic config values correctly" ); @@ -1406,6 +1490,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); match &conf.default_tenant_conf.eviction_policy { diff --git a/pageserver/src/consumption_metrics.rs 
b/pageserver/src/consumption_metrics.rs index 7ad6a0f890..bde2cedca7 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,7 +3,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; @@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker( info!("calculate_synthetic_size_worker stopped"); }; - let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; - loop { let started_at = Instant::now(); @@ -269,26 +267,25 @@ async fn calculate_synthetic_size_worker( } }; - for (tenant_id, tenant_state) in tenants { + for (tenant_shard_id, tenant_state) in tenants { if tenant_state != TenantState::Active { continue; } - if let Ok(tenant) = mgr::get_tenant(tenant_id, true) { - // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? - // We can put in some prioritization for consumption metrics. - // Same for the loop that fetches computed metrics. - // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, - // which turns out is really handy to understand the system. - if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - return Ok(()); - } - error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); - } + if !tenant_shard_id.is_zero() { + // We only send consumption metrics from shard 0, so don't waste time calculating + // synthetic size on other shards. 
+ continue; } + + let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + continue; + }; + + // there is never any reason to exit calculate_synthetic_size_worker following any + // return value -- we don't need to care about shutdown because no tenant is found when + // pageserver is shut down. + calculate_and_log(&tenant, cancel, ctx).await; } crate::tenant::tasks::warn_when_period_overrun( @@ -299,7 +296,7 @@ async fn calculate_synthetic_size_worker( let res = tokio::time::timeout_at( started_at + synthetic_size_calculation_interval, - task_mgr::shutdown_token().cancelled(), + cancel.cancelled(), ) .await; if res.is_ok() { @@ -307,3 +304,31 @@ async fn calculate_synthetic_size_worker( } } } + +async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) { + const CAUSE: LogicalSizeCalculationCause = + LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; + + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. + let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { + return; + }; + + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. 
+ let shutting_down = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + ); + + if !shutting_down { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } +} diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 213f08484c..0b827816bc 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,5 +1,4 @@ -use crate::context::RequestContext; -use anyhow::Context; +use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; @@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { - if state != TenantState::Active { + if state != TenantState::Active || !id.is_zero() { None } else { crate::tenant::mgr::get_tenant(id, true) .ok() - .map(|tenant| (id, tenant)) + .map(|tenant| (id.tenant_id, tenant)) } }); @@ -352,13 +351,16 @@ impl TimelineSnapshot { let current_exact_logical_size = { let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); - let res = span - .in_scope(|| t.get_current_logical_size(ctx)) - .context("get_current_logical_size"); - match res? { + let size = span.in_scope(|| { + t.get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::Background, + ctx, + ) + }); + match size { // Only send timeline logical size when it is fully calculated. 
- (size, is_exact) if is_exact => Some(size), - (_, _) => None, + CurrentLogicalSize::Exact(ref size) => Some(size.into()), + CurrentLogicalSize::Approximate(_) => None, } }; diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 7ff27ceb44..3a3d600ac2 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -312,7 +312,18 @@ impl ListWriter { for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants { if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) { if attached_gen.previous() == tenant_list.generation { + info!( + seq=%s, tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + old_gen=?tenant_list.generation, new_gen=?attached_gen, + "Updating gen on recovered list"); tenant_list.generation = *attached_gen; + } else { + info!( + seq=%s, tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + old_gen=?tenant_list.generation, new_gen=?attached_gen, + "Encountered stale generation on recovered list"); } } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f01cd1cf8c..23b9b573b6 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -42,7 +42,6 @@ // reading these fields. We use the Debug impl for semi-structured logging, though. use std::{ - collections::HashMap, sync::Arc, time::{Duration, SystemTime}, }; @@ -75,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` +/// partitioning. 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + /// Order the layers to be evicted by how recently they have been accessed in absolute + /// time. + /// + /// This strategy is unfair when some tenants grow faster than others towards the slower + /// growing. + #[default] + AbsoluteAccessed, + + /// Order the layers to be evicted by how recently they have been accessed relatively within + /// the set of resident layers of a tenant. + /// + /// This strategy will evict layers more fairly but is untested. + RelativeAccessed { + #[serde(default)] + highest_layer_count_loses_first: bool, + }, +} + +impl EvictionOrder { + /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer + /// counts should be the first ones to have their layers evicted. + fn highest_layer_count_loses_first(&self) -> bool { + match self { + EvictionOrder::AbsoluteAccessed => false, + EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => *highest_layer_count_loses_first, + } + } } #[derive(Default)] @@ -125,7 +163,7 @@ pub fn launch_disk_usage_global_eviction_task( async fn disk_usage_eviction_task( state: &State, task_config: &DiskUsageEvictionTaskConfig, - _storage: &GenericRemoteStorage, + storage: &GenericRemoteStorage, tenants_dir: &Utf8Path, cancel: CancellationToken, ) { @@ -149,8 +187,14 @@ async fn disk_usage_eviction_task( let start = Instant::now(); async { - let res = - disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await; + let res = disk_usage_eviction_task_iteration( + state, + task_config, + storage, + tenants_dir, + &cancel, + ) + .await; match res { Ok(()) => {} @@ -181,12 +225,20 @@ pub trait Usage: Clone + Copy + std::fmt::Debug { async fn disk_usage_eviction_task_iteration( state: &State, task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, tenants_dir: &Utf8Path, 
cancel: &CancellationToken, ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl( + state, + storage, + usage_pre, + task_config.eviction_order, + cancel, + ) + .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -268,9 +320,11 @@ struct LayerCount { count: usize, } -pub async fn disk_usage_eviction_task_iteration_impl( +pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, + _storage: &GenericRemoteStorage, usage_pre: U, + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) @@ -290,7 +344,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(cancel).await? { + let candidates = match collect_eviction_candidates(eviction_order, cancel).await? 
{ EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } @@ -300,16 +354,16 @@ pub async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { + let nth = i + 1; let desc = candidate.layer.layer_desc(); + let total_candidates = candidates.len(); + let size = desc.file_size; + let rel = candidate.relative_last_activity; debug!( - "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", - i + 1, - candidates.len(), - desc.file_size, + "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - partition, desc.tenant_shard_id, desc.timeline_id, candidate.layer, @@ -321,16 +375,16 @@ pub async fn disk_usage_eviction_task_iteration_impl( // Walk through the list of candidates, until we have accumulated enough layers to get // us back under the pressure threshold. 'usage_planned' is updated so that it tracks // how much disk space would be used after evicting all the layers up to the current - // point in the list. The layers are collected in 'batched', grouped per timeline. + // point in the list. // // If we get far enough in the list that we start to evict layers that are below // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. 
- let mut batched: HashMap<_, Vec<_>> = HashMap::new(); let mut warned = None; let mut usage_planned = usage_pre; - let mut max_batch_size = 0; - for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + let mut evicted_amount = 0; + + for (i, (partition, candidate)) in candidates.iter().enumerate() { if !usage_planned.has_pressure() { debug!( no_candidates_evicted = i, @@ -339,25 +393,13 @@ pub async fn disk_usage_eviction_task_iteration_impl( break; } - if partition == MinResidentSizePartition::Below && warned.is_none() { + if partition == &MinResidentSizePartition::Below && warned.is_none() { warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); warned = Some(usage_planned); } usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size); - - // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn - // tasks to evict all seen layers until we have evicted enough - - let batch = batched.entry(TimelineKey(candidate.timeline)).or_default(); - - // semaphore will later be used to limit eviction concurrency, and we can express at - // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted, - // but fail gracefully by not making batches larger. 
- if batch.len() < u32::MAX as usize { - batch.push(candidate.layer); - max_batch_size = max_batch_size.max(batch.len()); - } + evicted_amount += 1; } let usage_planned = match warned { @@ -372,100 +414,79 @@ pub async fn disk_usage_eviction_task_iteration_impl( }; debug!(?usage_planned, "usage planned"); - // phase2: evict victims batched by timeline + // phase2: evict layers let mut js = tokio::task::JoinSet::new(); + let limit = 1000; - // ratelimit to 1k files or any higher max batch size - let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); + let mut evicted = candidates.into_iter().take(evicted_amount).fuse(); + let mut consumed_all = false; - for (timeline, batch) in batched { - let tenant_shard_id = timeline.tenant_shard_id; - let timeline_id = timeline.timeline_id; - let batch_size = - u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning"); + // After the evictions, `usage_assumed` is the post-eviction usage, + // according to internal accounting. 
+ let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); - // I dislike naming of `available_permits` but it means current total amount of permits - // because permits can be added - assert!(batch_size as usize <= limit.available_permits()); + let evict_layers = async move { + loop { + let next = if js.len() >= limit || consumed_all { + js.join_next().await + } else if !js.is_empty() { + // opportunistically consume ready result, one per each new evicted + futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x) + } else { + None + }; - debug!(%timeline_id, "evicting batch for timeline"); - - let evict = { - let limit = limit.clone(); - let cancel = cancel.clone(); - async move { - let mut evicted_bytes = 0; - let mut evictions_failed = LayerCount::default(); - - let Ok(_permit) = limit.acquire_many_owned(batch_size).await else { - // semaphore closing means cancelled - return (evicted_bytes, evictions_failed); - }; - - let results = timeline.evict_layers(&batch).await; - - match results { - Ok(results) => { - assert_eq!(results.len(), batch.len()); - for (result, layer) in results.into_iter().zip(batch.iter()) { - let file_size = layer.layer_desc().file_size; - match result { - Some(Ok(())) => { - evicted_bytes += file_size; - } - Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { - evictions_failed.file_sizes += file_size; - evictions_failed.count += 1; - } - None => { - assert!(cancel.is_cancelled()); - } - } - } + if let Some(next) = next { + match next { + Ok(Ok(file_size)) => { + usage_assumed.add_available_bytes(file_size); } - Err(e) => { - warn!("failed to evict batch: {:#}", e); + Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + evictions_failed.file_sizes += file_size; + evictions_failed.count += 1; } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { /* already logged */ } + Err(je) => tracing::error!("unknown 
JoinError: {je:?}"), } - (evicted_bytes, evictions_failed) } - } - .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size)); - js.spawn(evict); - - // spwaning multiple thousands of these is essentially blocking, so give already spawned a - // chance of making progress - tokio::task::yield_now().await; - } - - let join_all = async move { - // After the evictions, `usage_assumed` is the post-eviction usage, - // according to internal accounting. - let mut usage_assumed = usage_pre; - let mut evictions_failed = LayerCount::default(); - - while let Some(res) = js.join_next().await { - match res { - Ok((evicted_bytes, failed)) => { - usage_assumed.add_available_bytes(evicted_bytes); - evictions_failed.file_sizes += failed.file_sizes; - evictions_failed.count += failed.count; - } - Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { /* already logged */ } - Err(je) => tracing::error!("unknown JoinError: {je:?}"), + if consumed_all && js.is_empty() { + break; } + + // calling again when consumed_all is fine as evicted is fused. + let Some((_partition, candidate)) = evicted.next() else { + consumed_all = true; + continue; + }; + + js.spawn(async move { + let rtc = candidate.timeline.remote_client.as_ref().expect( + "holding the witness, all timelines must have a remote timeline client", + ); + let file_size = candidate.layer.layer_desc().file_size; + candidate + .layer + .evict_and_wait(rtc) + .await + .map(|()| file_size) + .map_err(|e| (file_size, e)) + }); + + tokio::task::yield_now().await; } + (usage_assumed, evictions_failed) }; let (usage_assumed, evictions_failed) = tokio::select! 
{ - tuple = join_all => { tuple }, + tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { - // close the semaphore to stop any pending acquires - limit.close(); + // dropping joinset will abort all pending evict_and_waits and that is fine, our + // requests will still stand return Ok(IterationOutcome::Cancelled); } }; @@ -485,6 +506,7 @@ struct EvictionCandidate { timeline: Arc, layer: Layer, last_activity_ts: SystemTime, + relative_last_activity: finite_f32::FiniteF32, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -504,24 +526,24 @@ enum EvictionCandidates { /// order. A caller that evicts in that order, until pressure is relieved, implements /// the eviction policy outlined in the module comment. /// -/// # Example +/// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. /// The eviction order would be /// /// ```text -/// partition last_activity_ts tenant/layer -/// Above 18:30 A/c -/// Above 19:00 A/b -/// Above 18:29 B/c -/// Above 19:05 B/b -/// Above 20:00 B/a -/// Above 20:03 A/a -/// Below 20:30 A/d -/// Below 20:40 B/d -/// Below 20:45 B/e -/// Below 20:58 A/e +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. @@ -531,7 +553,77 @@ enum EvictionCandidates { /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhauting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. 
+/// +/// # Example with EvictionOrder::RelativeAccessed +/// +/// ```text +/// partition relative_age last_activity_ts tenant/layer +/// Above 0/4 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/4 19:00 A/b +/// Above 1/4 19:05 B/b +/// Above 2/4 20:00 B/a +/// Above 2/4 20:03 A/a +/// Below 3/4 20:30 A/d +/// Below 3/4 20:40 B/d +/// Below 4/4 20:45 B/e +/// Below 4/4 20:58 A/e +/// ``` +/// +/// With tenants having the same number of layers the picture does not change much. The same with +/// A having many more layers **resident** (not all of them listed): +/// +/// ```text +/// Above 0/100 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/100 19:00 A/b +/// Above 2/100 20:03 A/a +/// Above 3/100 20:03 A/nth_3 +/// Above 4/100 20:03 A/nth_4 +/// ... +/// Above 1/4 19:05 B/b +/// Above 25/100 20:04 A/nth_25 +/// ... +/// Above 2/4 20:00 B/a +/// Above 50/100 20:10 A/nth_50 +/// ... +/// Below 3/4 20:40 B/d +/// Below 99/100 20:30 A/nth_99 +/// Below 4/4 20:45 B/e +/// Below 100/100 20:58 A/nth_100 +/// ``` +/// +/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is +/// difficult to see is what happens on the next round, assuming that evicting 23 layers from the above list +/// relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast growing tenant C has +/// appeared: +/// +/// ```text +/// Above 0/87 20:04 A/nth_23 +/// Above 0/3 19:05 B/b +/// Above 0/50 20:59 C/nth_0 +/// Above 1/87 20:04 A/nth_24 +/// Above 1/50 21:00 C/nth_1 +/// Above 2/87 20:04 A/nth_25 +/// ... +/// Above 16/50 21:02 C/nth_16 +/// Above 1/3 20:00 B/a +/// Above 27/87 20:10 A/nth_50 +/// ... 
+/// Below 2/3 20:40 B/d +/// Below 49/50 21:05 C/nth_49 +/// Below 86/87 20:30 A/nth_99 +/// Below 3/3 20:45 B/e +/// Below 50/50 21:05 C/nth_50 +/// Below 87/87 20:58 A/nth_100 +/// ``` +/// +/// Now relieving pressure with 23 layers would cost: +/// - tenant A 14 layers +/// - tenant B 1 layer +/// - tenant C 8 layers async fn collect_eviction_candidates( + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { // get a snapshot of the list of tenants @@ -617,12 +709,63 @@ async fn collect_eviction_candidates( tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - for (timeline, layer_info) in tenant_candidates.into_iter() { + + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if eviction_order.highest_layer_count_loses_first() { + // relative_age vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionately on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. 
+ 1 + }; + + let total = tenant_candidates + .len() + .checked_sub(fudge) + .filter(|&x| x > 0) + // support 0 or 1 resident layer tenants as well + .unwrap_or(1); + let divider = total as f32; + + for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { let file_size = layer_info.file_size(); + + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + let relative_last_activity = if matches!( + eviction_order, + EvictionOrder::RelativeAccessed { .. } + ) { + // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or + // similarly for u16. unsure how it would help. + finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } else { + finite_f32::FiniteF32::ZERO + }; + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, + relative_last_activity, }; let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -636,8 +779,19 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + match eviction_order { + EvictionOrder::AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + EvictionOrder::RelativeAccessed { .. 
} => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }); + } + } Ok(EvictionCandidates::Finished(candidates)) } @@ -666,6 +820,66 @@ impl std::ops::Deref for TimelineKey { } } +/// A totally ordered f32 subset we can use with sorting functions. +mod finite_f32 { + + /// A totally ordered f32 subset we can use with sorting functions. + #[derive(Clone, Copy, PartialEq)] + pub struct FiniteF32(f32); + + impl std::fmt::Debug for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0, f) + } + } + + impl std::fmt::Display for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } + } + + impl std::cmp::Eq for FiniteF32 {} + + impl std::cmp::PartialOrd for FiniteF32 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl std::cmp::Ord for FiniteF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.total_cmp(&other.0) + } + } + + impl TryFrom for FiniteF32 { + type Error = f32; + + fn try_from(value: f32) -> Result { + if value.is_finite() { + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } + + impl FiniteF32 { + pub const ZERO: FiniteF32 = FiniteF32(0.0); + + pub fn try_from_normalized(value: f32) -> Result { + if (0.0..=1.0).contains(&value) { + // -0.0 is within the range, make sure it is assumed 0.0..=1.0 + let value = value.abs(); + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } +} + mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; @@ -747,6 +961,7 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { + use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -758,6 +973,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: 
EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 1c083bd382..c82d1c0362 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,4 +1,2 @@ pub mod routes; pub use routes::make_router; - -pub use pageserver_api::models; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2e418f4d8f..1fbca1086f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -84,7 +84,6 @@ paths: required: true schema: type: string - format: hex get: description: Get tenant status responses: @@ -160,6 +159,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" + "412": + description: Deletion may not proceed, tenant is not in Active state + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" "500": description: Generic operation error content: @@ -181,7 +186,6 @@ paths: required: true schema: type: string - format: hex get: description: Get timelines for tenant responses: @@ -232,7 +236,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -338,7 +341,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -401,7 +403,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -469,7 +470,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -523,7 +523,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules attach operation to happen in the background for the given tenant. 
@@ -631,7 +630,6 @@ paths: required: true schema: type: string - format: hex - name: flush_ms in: query required: false @@ -724,7 +722,6 @@ paths: required: true schema: type: string - format: hex - name: detach_ignored in: query required: false @@ -784,7 +781,6 @@ paths: required: true schema: type: string - format: hex post: description: | Remove tenant data (including all corresponding timelines) from pageserver's memory. @@ -833,7 +829,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules an operation that attempts to load a tenant from the local disk and @@ -890,7 +885,6 @@ paths: required: true schema: type: string - format: hex get: description: | Calculate tenant's synthetic size @@ -933,7 +927,6 @@ paths: required: true schema: type: string - format: hex - name: inputs_only in: query required: false @@ -1003,11 +996,10 @@ paths: required: true schema: type: string - format: hex post: description: | - Create a timeline. Returns new timeline id on success.\ - If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + Create a timeline. Returns new timeline id on success. + Recreating the same timeline will succeed if the parameters match the existing timeline. If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. 
requestBody: content: @@ -1028,6 +1020,9 @@ paths: format: hex pg_version: type: integer + existing_initdb_timeline_id: + type: string + format: hex responses: "201": description: TimelineInfo @@ -1134,7 +1129,6 @@ paths: application/json: schema: type: string - format: hex "400": description: Malformed tenant create request content: @@ -1231,7 +1225,6 @@ paths: required: true schema: type: string - format: hex get: description: | Returns tenant's config description: specific config overrides a tenant has @@ -1337,7 +1330,6 @@ components: properties: new_tenant_id: type: string - format: hex generation: type: integer description: Attachment generation number. @@ -1366,7 +1358,6 @@ components: properties: tenant_id: type: string - format: hex TenantLocationConfigRequest: type: object required: @@ -1374,7 +1365,6 @@ components: properties: tenant_id: type: string - format: hex mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1421,6 +1411,8 @@ components: type: integer trace_read_requests: type: boolean + heatmap_period: + type: integer TenantConfigResponse: type: object properties: @@ -1443,7 +1435,6 @@ components: format: hex tenant_id: type: string - format: hex last_record_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 37159be95c..157e6b4e3e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::TenantDetails; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -24,24 +25,23 @@ use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; +use 
utils::failpoint_support::failpoints_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; -use super::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, -}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; +use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, TenantSlotUpsertError, TenantStateError, }; +use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::CompactFlags; @@ -49,6 +49,10 @@ use crate::tenant::timeline::Timeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; +use pageserver_api::models::{ + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, +}; use utils::{ auth::SwappableJwtAuth, generation::Generation, @@ -63,8 +67,10 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -use super::models::ConfigureFailpointsRequest; +// For APIs that require an Active tenant, how long should we block waiting for that state? +// This is not functionally necessary (clients will retry), but avoids generating a lot of +// failed API calls while tenants are activating. 
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); pub struct State { conf: &'static PageServerConf, @@ -75,9 +81,11 @@ pub struct State { broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, + secondary_controller: SecondaryController, } impl State { + #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, @@ -86,6 +94,7 @@ impl State { broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, + secondary_controller: SecondaryController, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"] .iter() @@ -100,6 +109,7 @@ impl State { broker_client, disk_usage_eviction_state, deletion_queue_client, + secondary_controller, }) } @@ -136,11 +146,6 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. - ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } @@ -233,6 +238,19 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(e: GetActiveTenantError) -> ApiError { + match e { + GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), + GetActiveTenantError::Cancelled => ApiError::ShuttingDown, + GetActiveTenantError::NotFound(gte) => gte.into(), + GetActiveTenantError::WaitForActiveTimeout { .. 
} => { + ApiError::ResourceUnavailable(format!("{}", e).into()) + } + } + } +} + impl From for ApiError { fn from(e: SetNewTenantConfigError) -> ApiError { match e { @@ -288,6 +306,7 @@ impl From for ApiError { SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + Cancelled => ApiError::ShuttingDown, } } } @@ -319,6 +338,7 @@ async fn build_timeline_info_common( ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + let initdb_lsn = timeline.initdb_lsn; let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -338,13 +358,8 @@ async fn build_timeline_info_common( Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size(ctx) { - Ok((size, _)) => Some(size), - Err(err) => { - error!("Timeline info creation failed to get current logical size: {err:?}"); - None - } - }; + let current_logical_size = + timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); let remote_consistent_lsn_projected = timeline @@ -357,18 +372,22 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); let info = TimelineInfo { - // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id - tenant_id: timeline.tenant_shard_id.tenant_id, + tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), remote_consistent_lsn: remote_consistent_lsn_projected, remote_consistent_lsn_visible, + initdb_lsn, last_record_lsn, prev_record_lsn: 
Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - current_logical_size, + current_logical_size: current_logical_size.size_dont_care_about_accuracy(), + current_logical_size_is_accurate: match current_logical_size.accuracy() { + tenant::timeline::logical_size::Accuracy::Approximate => false, + tenant::timeline::logical_size::Accuracy::Exact => true, + }, current_physical_size, current_logical_size_non_incremental: None, timeline_dir_layer_file_size_sum: None, @@ -435,12 +454,16 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?; + let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + match tenant.create_timeline( new_timeline_id, request_data.ancestor_timeline_id.map(TimelineId::from), request_data.ancestor_start_lsn, request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, state.broker_client.clone(), &ctx, ) @@ -452,7 +475,7 @@ async fn timeline_create_handler( .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } - Err(tenant::CreateTimelineError::AlreadyExists) => { + Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { json_response(StatusCode::CONFLICT, ()) } Err(tenant::CreateTimelineError::AncestorLsn(err)) => { @@ -480,15 +503,15 @@ async fn timeline_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + 
check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -507,7 +530,9 @@ async fn timeline_list_handler( } Ok::, ApiError>(response_data) } - .instrument(info_span!("timeline_list", %tenant_id)) + .instrument(info_span!("timeline_list", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, response_data) @@ -517,17 +542,17 @@ async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. 
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -544,7 +569,10 @@ async fn timeline_detail_handler( Ok::<_, ApiError>(timeline_info) } - .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_detail", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) .await?; json_response(StatusCode::OK, timeline_info) @@ -554,10 +582,15 @@ async fn get_lsn_by_timestamp_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let version: Option = parse_query_param(&request, "version")?; + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; @@ -567,43 +600,37 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - - if version.unwrap_or(0) > 1 { - #[derive(serde::Serialize)] - struct Result { - 
lsn: Lsn, - kind: &'static str, - } - let (lsn, kind) = match result { - LsnForTimestamp::Present(lsn) => (lsn, "present"), - LsnForTimestamp::Future(lsn) => (lsn, "future"), - LsnForTimestamp::Past(lsn) => (lsn, "past"), - LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), - }; - json_response(StatusCode::OK, Result { lsn, kind }) - } else { - // FIXME: this is a temporary crutch not to break backwards compatibility - // See https://github.com/neondatabase/neon/pull/5608 - let result = match result { - LsnForTimestamp::Present(lsn) => format!("{lsn}"), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - json_response(StatusCode::OK, result) + #[derive(serde::Serialize)] + struct Result { + lsn: Lsn, + kind: &'static str, } + let (lsn, kind) = match result { + LsnForTimestamp::Present(lsn) => (lsn, "present"), + LsnForTimestamp::Future(lsn) => (lsn, "future"), + LsnForTimestamp::Past(lsn) => (lsn, "past"), + LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), + }; + json_response(StatusCode::OK, Result { lsn, kind }) } async fn get_timestamp_of_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -613,7 +640,7 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = 
active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -674,11 +701,23 @@ async fn timeline_delete_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); - state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx) - .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false) + .map_err(|e| { + match e { + // GetTenantError has a built-in conversion to ApiError, but in this context we don't + // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. 
+ GetTenantError::NotFound(_) => ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ), + e => e.into(), + } + })?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -709,6 +748,26 @@ async fn tenant_detach_handler( json_response(StatusCode::OK, ()) } +async fn tenant_reset_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let drop_cache: Option = parse_query_param(&request, "drop_cache")?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); + state + .tenant_manager + .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn tenant_load_handler( mut request: Request, _cancel: CancellationToken, @@ -785,11 +844,11 @@ async fn tenant_status( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let tenant_info = async { - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -798,14 +857,19 @@ async fn tenant_status( } let state = tenant.current_state(); - Result::<_, ApiError>::Ok(TenantInfo { - id: 
tenant_id, - state: state.clone(), - current_physical_size: Some(current_physical_size), - attachment_status: state.attachment_status(), + Result::<_, ApiError>::Ok(TenantDetails { + tenant_info: TenantInfo { + id: tenant_shard_id, + state: state.clone(), + current_physical_size: Some(current_physical_size), + attachment_status: state.attachment_status(), + }, + timelines: tenant.list_timeline_ids(), }) } - .instrument(info_span!("tenant_status_handler", %tenant_id)) + .instrument(info_span!("tenant_status_handler", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, tenant_info) @@ -821,10 +885,12 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id) + state + .tenant_manager + .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = tenant_shard_id.shard_slug() + shard = %tenant_shard_id.shard_slug() )) .await?; @@ -848,14 +914,20 @@ async fn tenant_size_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; + + if !tenant_shard_id.is_zero() { + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + 
))); + } // this can be long operation let inputs = tenant @@ -907,7 +979,7 @@ async fn tenant_size_handler( json_response( StatusCode::OK, TenantHistorySize { - id: tenant_id, + id: tenant_shard_id.tenant_id, size: sizes.as_ref().map(|x| x.total_size), segment_sizes: sizes.map(|x| x.segments), inputs, @@ -919,14 +991,14 @@ async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -936,13 +1008,12 @@ async fn layer_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let downloaded = timeline .download_layer(layer_file_name) 
.await @@ -953,7 +1024,7 @@ async fn layer_download_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -962,12 +1033,12 @@ async fn evict_timeline_layer_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let evicted = timeline .evict_layer(layer_file_name) .await @@ -978,7 +1049,7 @@ async fn evict_timeline_layer_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -1089,7 +1160,10 @@ async fn tenant_create_handler( // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns. - if let res @ Err(_) = new_tenant.wait_to_become_active().await { + if let res @ Err(_) = new_tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + { // This shouldn't happen because we just created the tenant directory // in tenant::mgr::create_tenant, and there aren't any remote timelines // to load, so, nothing can really fail during load. 
@@ -1110,10 +1184,10 @@ async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let response = HashMap::from([ ( @@ -1173,7 +1247,7 @@ async fn put_tenant_location_config_handler( mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, - shard = tenant_shard_id.shard_slug() + shard = %tenant_shard_id.shard_slug() )) .await { @@ -1207,9 +1281,9 @@ async fn handle_tenant_break( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; tenant.set_broken("broken from test".to_owned()).await; @@ -1217,47 +1291,20 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } -async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow!( - "Cannot manage failpoints because pageserver was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized 
- // by the failpoints crate: exit, to immediately kill the process - let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} - // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = + mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -1272,9 +1319,9 @@ async fn timeline_compact_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? 
{ @@ -1282,14 +1329,14 @@ async fn timeline_compact_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1298,9 +1345,9 @@ async fn timeline_checkpoint_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? 
{ @@ -1308,7 +1355,7 @@ async fn timeline_checkpoint_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await @@ -1320,7 +1367,7 @@ async fn timeline_checkpoint_handler( json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1328,12 +1375,12 @@ async fn timeline_download_remote_layers_handler_post( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1344,11 +1391,11 @@ async fn timeline_download_remote_layers_handler_get( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, 
Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1394,9 +1441,9 @@ async fn getpage_at_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; struct Key(crate::repository::Key); @@ -1415,7 +1462,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1427,7 +1474,7 @@ async fn getpage_at_lsn_handler( .unwrap(), ) } - .instrument(info_span!("timeline_get", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1435,95 +1482,34 @@ async fn timeline_collect_keyspace( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; - - 
struct Partitioning { - keys: crate::keyspace::KeySpace, - - at_lsn: Lsn, - } - - impl serde::Serialize for Partitioning { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeMap; - let mut map = serializer.serialize_map(Some(2))?; - map.serialize_key("keys")?; - map.serialize_value(&KeySpace(&self.keys))?; - map.serialize_key("at_lsn")?; - map.serialize_value(&WithDisplay(&self.at_lsn))?; - map.end() - } - } - - struct WithDisplay<'a, T>(&'a T); - - impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - serializer.collect_str(&self.0) - } - } - - struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - - impl<'a> serde::Serialize for KeySpace<'a> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeSeq; - let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; - for kr in &self.0.ranges { - seq.serialize_element(&KeyRange(kr))?; - } - seq.end() - } - } - - struct KeyRange<'a>(&'a std::ops::Range); - - impl<'a> serde::Serialize for KeyRange<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - use serde::ser::SerializeTuple; - let mut t = serializer.serialize_tuple(2)?; - t.serialize_element(&WithDisplay(&self.0.start))?; - t.serialize_element(&WithDisplay(&self.0.end))?; - t.end() - } - } + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let keys = timeline 
.collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - json_response(StatusCode::OK, Partitioning { keys, at_lsn }) + let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + + json_response(StatusCode::OK, res) } - .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn active_timeline_of_active_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; tenant .get_timeline(timeline_id, true) .map_err(|e| ApiError::NotFound(e.into())) @@ -1545,7 +1531,7 @@ async fn always_panic_handler( async fn disk_usage_eviction_run( mut r: Request, - _cancel: CancellationToken, + cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&r, None)?; @@ -1553,19 +1539,22 @@ async fn disk_usage_eviction_run( struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, + + #[serde(default)] + eviction_order: crate::disk_usage_eviction_task::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct - config: Config, + evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { - self.config.evict_bytes > self.freed_bytes + self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { @@ -1573,57 +1562,52 @@ async fn disk_usage_eviction_run( } } - let config = json_request::(&mut r) - .await - .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + let config = json_request::(&mut r).await?; let usage = Usage { - config, + evict_bytes: config.evict_bytes, freed_bytes: 0, }; - let (tx, rx) = tokio::sync::oneshot::channel(); - let state = get_state(&r); - if state.remote_storage.as_ref().is_none() { + let Some(storage) = state.remote_storage.as_ref() else { return Err(ApiError::InternalServerError(anyhow::anyhow!( "remote storage not configured, cannot run eviction iteration" ))); - } + }; let state = state.disk_usage_eviction_state.clone(); - let cancel = CancellationToken::new(); - let child_cancel = cancel.clone(); - let _g = cancel.drop_guard(); + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, + storage, + usage, + config.eviction_order, + &cancel, + ) + .await; - crate::task_mgr::spawn( - crate::task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, - "ondemand disk usage eviction", - false, - async move { - let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, - usage, - &child_cancel, - ) - .await; + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); - info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + let res = 
res.map_err(ApiError::InternalServerError)?; - let _ = tx.send(res); - Ok(()) - } - .in_current_span(), - ); + json_response(StatusCode::OK, res) +} - let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; +async fn secondary_upload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + json_response(StatusCode::OK, ()) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -1800,23 +1784,25 @@ pub fn make_router( }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) - .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .get("/v1/tenant/:tenant_shard_id", |r| { + api_handler(r, tenant_status) + }) .delete("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_delete_handler) }) - .get("/v1/tenant/:tenant_id/synthetic_size", |r| { + .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) - .get("/v1/tenant/:tenant_id/config", |r| { + .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { api_handler(r, put_tenant_location_config_handler) }) - .get("/v1/tenant/:tenant_id/timeline", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { @@ -1828,73 +1814,83 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/detach", |r| { api_handler(r, tenant_detach_handler) }) + .post("/v1/tenant/:tenant_shard_id/reset", |r| { + api_handler(r, 
tenant_reset_handler) + }) .post("/v1/tenant/:tenant_id/load", |r| { api_handler(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { api_handler(r, tenant_ignore_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp", |r| api_handler(r, get_lsn_by_timestamp_handler), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - api_handler(r, timeline_gc_handler) - }) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { - testing_api_handler("run timeline compaction", r, timeline_compact_handler) - }) .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", + |r| api_handler(r, timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) 
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - api_handler(r, layer_map_info_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", + |r| api_handler(r, layer_map_info_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), ) .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { + api_handler(r, secondary_upload_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) - .put("/v1/tenant/:tenant_id/break", |r| { + .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) .get("/v1/panic", |r| api_handler(r, always_panic_handler)) .post("/v1/tracing/event", |r| { testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| { - testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", + |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), ) .any(handler_404)) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 770458e02e..d66df36b3a 100644 --- 
a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,17 +2,18 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! +use std::io::SeekFrom; use std::path::{Path, PathBuf}; -use std::pin::Pin; -use std::task::{self, Poll}; use anyhow::{bail, ensure, Context, Result}; +use async_compression::tokio::bufread::ZstdDecoder; use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use nix::NixPath; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tokio_tar::Archive; use tokio_tar::Builder; use tokio_tar::HeaderMode; @@ -20,6 +21,7 @@ use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; @@ -311,13 +313,16 @@ async fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + WAL_INGEST.records_committed.inc(); + + modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; @@ -447,13 +452,14 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(end_lsn); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); @@ -628,70 +634,16 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result Ok(Bytes::from(buf)) } -/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then -/// -/// The number of yields is bounded by above by the number of times poll_write is called, -/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total. -/// This is an explicit choice as the `YieldingVec` is meant to give the async executor -/// breathing room between units of CPU intensive preparation of buffers to be written. -/// Once a write call is issued, the whole buffer has been prepared already, so there is no -/// gain in splitting up the memcopy further. -struct YieldingVec { - yield_budget: usize, - // the buffer written into - buf: Vec, -} +pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&tmp_path) + .await + .with_context(|| format!("tempfile creation {tmp_path}"))?; -impl YieldingVec { - fn new() -> Self { - Self { - yield_budget: 0, - buf: Vec::new(), - } - } - // Whether we should yield for a read operation of given size - fn should_yield(&mut self, add_buf_len: usize) -> bool { - // Set this limit to a small value so that we are a - // good async citizen and yield repeatedly (but not - // too often for many small writes to cause many yields) - const YIELD_DIST: usize = 1024; - - let target_buf_len = self.buf.len() + add_buf_len; - let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST; - if self.yield_budget < target_buf_len { - self.yield_budget += add_buf_len; - } - ret - } -} - -impl AsyncWrite for YieldingVec { - fn poll_write( - mut self: 
Pin<&mut Self>, - cx: &mut task::Context<'_>, - buf: &[u8], - ) -> Poll> { - if self.should_yield(buf.len()) { - cx.waker().wake_by_ref(); - return Poll::Pending; - } - self.get_mut().buf.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } - - fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll> { - Poll::Ready(Ok(())) - } - - fn poll_shutdown( - self: Pin<&mut Self>, - _cx: &mut task::Context<'_>, - ) -> Poll> { - Poll::Ready(Ok(())) - } -} - -pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result> { let mut paths = Vec::new(); for entry in WalkDir::new(pgdata_path) { let entry = entry?; @@ -706,7 +658,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result> { // Do a sort to get a more consistent listing paths.sort_unstable(); let zstd = ZstdEncoder::with_quality_and_params( - YieldingVec::new(), + file, Level::Default, &[CParameter::enable_long_distance_matching(true)], ); @@ -724,11 +676,22 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result> { } let mut zstd = builder.into_inner().await?; zstd.shutdown().await?; - let compressed = zstd.into_inner(); - let compressed_len = compressed.buf.len(); - const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000; + let mut compressed = zstd.into_inner(); + let compressed_len = compressed.metadata().await?.len(); + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); } - Ok(compressed.buf) + compressed.seek(SeekFrom::Start(0)).await?; + Ok((compressed, compressed_len)) +} + +pub async fn extract_tar_zst( + pgdata_path: &Utf8Path, + tar_zst: impl AsyncBufRead + Unpin, +) -> Result<()> { + let tar = Box::pin(ZstdDecoder::new(tar_zst)); + let mut archive = Archive::new(tar); + archive.unpack(pgdata_path).await?; + Ok(()) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3f74694ef2..c1ce0af47b 100644 --- 
a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -10,7 +10,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; -pub mod keyspace; +pub use pageserver_api::keyspace; pub mod metrics; pub mod page_cache; pub mod page_service; @@ -25,8 +25,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -pub mod failpoint_support; - use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; @@ -186,13 +184,6 @@ pub struct InitializationOrder { /// Each initial tenant load task carries this until completion. pub initial_tenant_load: Option, - /// Barrier for when we can start initial logical size calculations. - pub initial_logical_size_can_start: utils::completion::Barrier, - - /// Each timeline owns a clone of this to be consumed on the initial logical size calculation - /// attempt. It is important to drop this once the attempt has completed. - pub initial_logical_size_attempt: Option, - /// Barrier for when we can start any background jobs. /// /// This can be broken up later on, but right now there is just one class of a background job. @@ -212,7 +203,7 @@ async fn timed( match tokio::time::timeout(warn_at, &mut fut).await { Ok(ret) => { tracing::info!( - task = name, + stage = name, elapsed_ms = started.elapsed().as_millis(), "completed" ); @@ -220,7 +211,7 @@ async fn timed( } Err(_) => { tracing::info!( - task = name, + stage = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." 
); @@ -229,7 +220,7 @@ async fn timed( // this has a global allowed_errors tracing::warn!( - task = name, + stage = name, elapsed_ms = started.elapsed().as_millis(), "completed, took longer than expected" ); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c15b1fce63..4725903783 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2,9 +2,10 @@ use enum_map::EnumMap; use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, - register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, - register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram, - HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, + register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, + IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; @@ -285,6 +286,63 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS }, }); +pub(crate) mod page_cache_eviction_metrics { + use std::num::NonZeroUsize; + + use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use once_cell::sync::Lazy; + + #[derive(Clone, Copy)] + pub(crate) enum Outcome { + FoundSlotUnused { iters: NonZeroUsize }, + FoundSlotEvicted { iters: NonZeroUsize }, + ItersExceeded { iters: NonZeroUsize }, + } + + static ITERS_TOTAL_VEC: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_find_victim_iters_total", + "Counter for the number of iterations in the find_victim loop", + &["outcome"], + ) + .expect("failed to define a metric") + }); + + static CALLS_VEC: Lazy = 
Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_find_victim_calls", + "Incremented at the end of each find_victim() call.\ + Filter by outcome to get e.g., eviction rate.", + &["outcome"] + ) + .unwrap() + }); + + pub(crate) fn observe(outcome: Outcome) { + macro_rules! dry { + ($label:literal, $iters:expr) => {{ + static LABEL: &'static str = $label; + static ITERS_TOTAL: Lazy = + Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL])); + static CALLS: Lazy = + Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL])); + ITERS_TOTAL.inc_by(($iters.get()) as u64); + CALLS.inc(); + }}; + } + match outcome { + Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters), + Outcome::FoundSlotEvicted { iters } => { + dry!("found_evicted", iters) + } + Outcome::ItersExceeded { iters } => { + dry!("err_iters_exceeded", iters); + super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit); + } + } + } +} + pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_page_cache_acquire_pinned_slot_seconds", @@ -294,14 +352,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_page_cache_find_victim_iters_total", - "Counter for the number of iterations in the find_victim loop", - ) - .expect("failed to define a metric") -}); - static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { register_int_counter_vec!( "page_cache_errors_total", @@ -403,6 +453,133 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +pub(crate) mod initial_logical_size { + use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use once_cell::sync::Lazy; + + pub(crate) struct StartCalculation(IntCounterVec); + pub(crate) static START_CALCULATION: Lazy = 
Lazy::new(|| { + StartCalculation( + register_int_counter_vec!( + "pageserver_initial_logical_size_start_calculation", + "Incremented each time we start an initial logical size calculation attempt. \ + The `circumstances` label provides some additional details.", + &["attempt", "circumstances"] + ) + .unwrap(), + ) + }); + + struct DropCalculation { + first: IntCounter, + retry: IntCounter, + } + + static DROP_CALCULATION: Lazy = Lazy::new(|| { + let vec = register_int_counter_vec!( + "pageserver_initial_logical_size_drop_calculation", + "Incremented each time we abort a started size calculation attmpt.", + &["attempt"] + ) + .unwrap(); + DropCalculation { + first: vec.with_label_values(&["first"]), + retry: vec.with_label_values(&["retry"]), + } + }); + + pub(crate) struct Calculated { + pub(crate) births: IntCounter, + pub(crate) deaths: IntCounter, + } + + pub(crate) static CALCULATED: Lazy = Lazy::new(|| Calculated { + births: register_int_counter!( + "pageserver_initial_logical_size_finish_calculation", + "Incremented every time we finish calculation of initial logical size.\ + If everything is working well, this should happen at most once per Timeline object." + ) + .unwrap(), + deaths: register_int_counter!( + "pageserver_initial_logical_size_drop_finished_calculation", + "Incremented when we drop a finished initial logical size calculation result.\ + Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge." 
+ ) + .unwrap(), + }); + + pub(crate) struct OngoingCalculationGuard { + inc_drop_calculation: Option, + } + + #[derive(strum_macros::IntoStaticStr)] + pub(crate) enum StartCircumstances { + EmptyInitial, + SkippedConcurrencyLimiter, + AfterBackgroundTasksRateLimit, + } + + impl StartCalculation { + pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { + let circumstances_label: &'static str = circumstances.into(); + self.0 + .with_label_values(&["first", circumstances_label]) + .inc(); + OngoingCalculationGuard { + inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), + } + } + pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { + let circumstances_label: &'static str = circumstances.into(); + self.0 + .with_label_values(&["retry", circumstances_label]) + .inc(); + OngoingCalculationGuard { + inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), + } + } + } + + impl Drop for OngoingCalculationGuard { + fn drop(&mut self) { + if let Some(counter) = self.inc_drop_calculation.take() { + counter.inc(); + } + } + } + + impl OngoingCalculationGuard { + pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard { + drop(self.inc_drop_calculation.take()); + CALCULATED.births.inc(); + FinishedCalculationGuard { + inc_on_drop: CALCULATED.deaths.clone(), + } + } + } + + pub(crate) struct FinishedCalculationGuard { + inc_on_drop: IntCounter, + } + + impl Drop for FinishedCalculationGuard { + fn drop(&mut self) { + self.inc_on_drop.inc(); + } + } + + // context: https://github.com/neondatabase/neon/issues/5963 + pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy = + Lazy::new(|| { + register_int_counter!( + "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size", + "Counter for the following event: walreceiver calls\ + Timeline::get_current_logical_size() and it returns `Approximate` for the first time." 
+ ) + .unwrap() + }); +} + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -478,7 +655,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| "pageserver_evictions_with_low_residence_duration", "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \ Residence duration is determined using the `residence_duration_data_source`.", - &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] ) .expect("failed to define a metric") }); @@ -511,14 +688,54 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); -/// How long did tenants take to go from construction to active state? -pub(crate) static TENANT_ACTIVATION: Lazy = Lazy::new(|| { - register_histogram!( +/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things +/// like how long it took to load. +/// +/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant +/// metrics are rather expensive, and usually fine grained stuff makes more sense +/// at a timeline level than tenant level. +pub(crate) struct TenantMetrics { + /// How long did tenants take to go from construction to active state? + pub(crate) activation: Histogram, + pub(crate) preload: Histogram, + pub(crate) attach: Histogram, + + /// How many tenants are included in the initial startup of the pagesrever? 
+ pub(crate) startup_scheduled: IntCounter, + pub(crate) startup_complete: IntCounter, +} + +pub(crate) static TENANT: Lazy = Lazy::new(|| { + TenantMetrics { + activation: register_histogram!( "pageserver_tenant_activation_seconds", "Time taken by tenants to activate, in seconds", CRITICAL_OP_BUCKETS.into() ) - .expect("Failed to register pageserver_tenant_activation_seconds metric") + .expect("Failed to register metric"), + preload: register_histogram!( + "pageserver_tenant_preload_seconds", + "Time taken by tenants to load remote metadata on startup/attach, in seconds", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + attach: register_histogram!( + "pageserver_tenant_attach_seconds", + "Time taken by tenants to intialize, after remote metadata is already loaded", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + startup_scheduled: register_int_counter!( + "pageserver_tenant_startup_scheduled", + "Number of tenants included in pageserver startup (doesn't count tenants attached later)" + ).expect("Failed to register metric"), + startup_complete: register_int_counter!( + "pageserver_tenant_startup_complete", + "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \ + should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \ + tenants: such cases will lead to this metric never reaching the scheduled count." + ).expect("Failed to register metric"), +} }); /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. 
@@ -542,10 +759,16 @@ impl EvictionsWithLowResidenceDurationBuilder { } } - fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration { + fn build( + &self, + tenant_id: &str, + shard_id: &str, + timeline_id: &str, + ) -> EvictionsWithLowResidenceDuration { let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION .get_metric_with_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold), @@ -576,21 +799,24 @@ impl EvictionsWithLowResidenceDuration { pub fn change_threshold( &mut self, tenant_id: &str, + shard_id: &str, timeline_id: &str, new_threshold: Duration, ) { if new_threshold == self.threshold { return; } - let mut with_new = - EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold) - .build(tenant_id, timeline_id); + let mut with_new = EvictionsWithLowResidenceDurationBuilder::new( + self.data_source, + new_threshold, + ) + .build(tenant_id, shard_id, timeline_id); std::mem::swap(self, &mut with_new); - with_new.remove(tenant_id, timeline_id); + with_new.remove(tenant_id, shard_id, timeline_id); } // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`. 
- fn remove(&mut self, tenant_id: &str, timeline_id: &str) { + fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) { let Some(_counter) = self.counter.take() else { return; }; @@ -599,6 +825,7 @@ impl EvictionsWithLowResidenceDuration { let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &threshold, @@ -651,6 +878,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ )] pub(crate) enum StorageIoOperation { Open, + OpenAfterReplace, Close, CloseByReplace, Read, @@ -664,6 +892,7 @@ impl StorageIoOperation { pub fn as_str(&self) -> &'static str { match self { StorageIoOperation::Open => "open", + StorageIoOperation::OpenAfterReplace => "open-after-replace", StorageIoOperation::Close => "close", StorageIoOperation::CloseByReplace => "close-by-replace", StorageIoOperation::Read => "read", @@ -718,6 +947,25 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) mod virtual_file_descriptor_cache { + use super::*; + + pub(crate) static SIZE_MAX: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_virtual_file_descriptor_cache_size_max", + "Maximum number of open file descriptors in the cache." 
+ ) + .unwrap() + }); + + // SIZE_CURRENT: derive it like so: + // ``` + // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$") + // -ignoring(operation) + // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"} + // ``` +} + #[derive(Debug)] struct GlobalAndPerTimelineHistogram { global: Histogram, @@ -775,12 +1023,62 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { + [ + 1, + 10, + 20, + 40, + 60, + 80, + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1_000, // 1ms + 2_000, + 4_000, + 6_000, + 8_000, + 10_000, // 10ms + 20_000, + 40_000, + 60_000, + 80_000, + 100_000, + 200_000, + 400_000, + 600_000, + 800_000, + 1_000_000, // 1s + 2_000_000, + 4_000_000, + 6_000_000, + 8_000_000, + 10_000_000, // 10s + 20_000_000, + 50_000_000, + 100_000_000, + 200_000_000, + 1_000_000_000, // 1000s + ] + .into_iter() + .map(Duration::from_micros) + .map(|d| d.as_secs_f64()) + .collect() +}); + static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", "Time spent on smgr query handling, aggregated by query type.", &["smgr_query_type"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) .expect("failed to define a metric") }); @@ -1044,6 +1342,52 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); +pub(crate) struct WalIngestMetrics { + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + 
"pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), +}); +pub(crate) struct SecondaryModeMetrics { + pub(crate) upload_heatmap: IntCounter, + pub(crate) upload_heatmap_errors: IntCounter, + pub(crate) upload_heatmap_duration: Histogram, +} +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { + upload_heatmap: register_int_counter!( + "pageserver_secondary_upload_heatmap", + "Number of heatmaps written to remote storage by attached tenants" + ) + .expect("failed to define a metric"), + upload_heatmap_errors: register_int_counter!( + "pageserver_secondary_upload_heatmap_errors", + "Failures writing heatmap to remote storage" + ) + .expect("failed to define a metric"), + upload_heatmap_duration: register_histogram!( + "pageserver_secondary_upload_heatmap_duration", + "Time to build and upload a heatmap, including any waiting inside the S3 client" + ) + .expect("failed to define a metric"), +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -1094,25 +1438,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = - Lazy::new(|| { - register_int_counter_vec!( - "pageserver_background_loop_semaphore_wait_start_count", - "Counter for background loop concurrency-limiting semaphore acquire calls started", - &["task"], - ) - .unwrap() - }); - -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = - Lazy::new(|| { - register_int_counter_vec!( - "pageserver_background_loop_semaphore_wait_finish_count", - "Counter for background loop concurrency-limiting semaphore acquire 
calls finished", - &["task"], - ) - .unwrap() - }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() +}); pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( @@ -1265,6 +1600,8 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, pub(crate) killed_by_cause: enum_map::EnumMap, + pub(crate) active_stderr_logger_tasks_started: IntCounter, + pub(crate) active_stderr_logger_tasks_finished: IntCounter, } #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)] @@ -1288,6 +1625,19 @@ impl Default for WalRedoProcessCounters { &["cause"], ) .unwrap(); + + let active_stderr_logger_tasks_started = register_int_counter!( + "pageserver_walredo_stderr_logger_tasks_started_total", + "Number of active walredo stderr logger tasks that have started", + ) + .unwrap(); + + let active_stderr_logger_tasks_finished = register_int_counter!( + "pageserver_walredo_stderr_logger_tasks_finished_total", + "Number of active walredo stderr logger tasks that have finished", + ) + .unwrap(); + Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { @@ -1295,6 +1645,8 @@ impl Default for WalRedoProcessCounters { let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), + active_stderr_logger_tasks_started, + active_stderr_logger_tasks_finished, } } } @@ -1369,6 +1721,7 @@ impl StorageTimeMetrics { #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, + shard_id: String, timeline_id: String, pub 
flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, @@ -1389,11 +1742,12 @@ pub struct TimelineMetrics { impl TimelineMetrics { pub fn new( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { - let tenant_id = tenant_id.to_string(); + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); let flush_time_histo = StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); @@ -1430,11 +1784,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let evictions_with_low_residence_duration = - evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); + let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder + .build(&tenant_id, &shard_id, &timeline_id); TimelineMetrics { tenant_id, + shard_id, timeline_id, flush_time_histo, compact_time_histo, @@ -1480,6 +1835,7 @@ impl Drop for TimelineMetrics { fn drop(&mut self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; + let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); @@ -1493,7 +1849,7 @@ impl Drop for TimelineMetrics { self.evictions_with_low_residence_duration .write() .unwrap() - .remove(tenant_id, timeline_id); + .remove(tenant_id, shard_id, timeline_id); // The following metrics are born outside of the TimelineMetrics lifecycle but still // removed at the end of it. 
The idea is to have the metrics outlive the @@ -1951,9 +2307,14 @@ pub fn preinitialize_metrics() { // Deletion queue stats Lazy::force(&DELETION_QUEUE); + // Tenant stats + Lazy::force(&TENANT); + // Tenant manager stats Lazy::force(&TENANT_MANAGER); + Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] .into_iter() diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 0702057766..c3c98af406 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -28,7 +28,7 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`]. +//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. //! //! The cache key for **immutable file** pages is [`FileId`] and a block number. @@ -83,12 +83,14 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use pageserver_api::shard::TenantShardId; +use utils::{id::TimelineId, lsn::Lsn}; -use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key}; +use crate::{ + context::RequestContext, + metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, + repository::Key, +}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -150,7 +152,13 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: TenantId, + /// Why is this TenantShardId rather than TenantId? + /// + /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. 
However, this + /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this + /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are + /// special-cased in some other way. + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, } @@ -374,7 +382,7 @@ impl PageCache { /// returned page. pub async fn lookup_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: &Key, lsn: Lsn, @@ -391,7 +399,7 @@ impl PageCache { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key: *key, }, @@ -432,7 +440,7 @@ impl PageCache { /// pub async fn memorize_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, lsn: Lsn, @@ -440,7 +448,7 @@ impl PageCache { ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key, }, @@ -897,8 +905,10 @@ impl PageCache { // Note that just yielding to tokio during iteration without such // priority boosting is likely counter-productive. We'd just give more opportunities // for B to bump usage count, further starving A. 
- crate::metrics::page_cache_errors_inc( - crate::metrics::PageCacheErrorKind::EvictIterLimit, + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::ItersExceeded { + iters: iters.try_into().unwrap(), + }, ); anyhow::bail!("exceeded evict iter limit"); } @@ -909,8 +919,18 @@ impl PageCache { // remove mapping for old buffer self.remove_mapping(old_key); inner.key = None; + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::FoundSlotEvicted { + iters: iters.try_into().unwrap(), + }, + ); + } else { + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::FoundSlotUnused { + iters: iters.try_into().unwrap(), + }, + ); } - crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64); return Ok((slot_idx, inner)); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 82c16eb9bd..d478d375f8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -53,21 +53,23 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::pgdatadir_mapping::{rel_block_to_key, Version}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::ShardSelector; use crate::tenant::Timeline; use crate::trace::Tracer; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which +// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which // is not yet in state [`TenantState::Active`]. 
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /// Read the end of a tar archive. /// @@ -399,16 +401,19 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id(); - // TODO(sharding): enumerate local tenant shards for this tenant, and select the one - // that should serve this request. - - // Make request tracer if needed + // Note that since one connection may contain getpage requests that target different + // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant + // that we look up here may not be the one that serves all the actual requests: we will double + // check the mapping of key->shard later before calling into Timeline for getpage requests. let tenant = mgr::get_active_tenant_with_timeout( tenant_id, + ShardSelector::First, ACTIVE_TENANT_TIMEOUT, &task_mgr::shutdown_token(), ) .await?; + + // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = @@ -566,6 +571,7 @@ impl PageServerHandler { info!("creating new timeline"); let tenant = get_active_tenant_with_timeout( tenant_id, + ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT, &task_mgr::shutdown_token(), ) @@ -628,7 +634,7 @@ impl PageServerHandler { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id) + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { @@ -741,7 +747,7 @@ impl PageServerHandler { .await?; let exists = timeline - .get_rel_exists(req.rel, lsn, req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -760,7 +766,9 @@ impl PageServerHandler { 
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; + let n_blocks = timeline + .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -779,7 +787,13 @@ impl PageServerHandler { .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::Lsn(lsn), + req.latest, + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -788,7 +802,7 @@ impl PageServerHandler { })) } - async fn handle_get_page_at_lsn_request( + async fn do_handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, @@ -798,17 +812,8 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - /* - // Add a 1s delay to some requests. The delay helps the requests to - // hit the race condition from github issue #1047 more easily. 
- use rand::Rng; - if rand::thread_rng().gen::() < 5 { - std::thread::sleep(std::time::Duration::from_millis(1000)); - } - */ - let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -816,6 +821,58 @@ impl PageServerHandler { })) } + async fn handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, + req: &PagestreamGetPageRequest, + ctx: &RequestContext, + ) -> anyhow::Result { + let key = rel_block_to_key(req.rel, req.blkno); + if timeline.get_shard_identity().is_key_local(&key) { + self.do_handle_get_page_at_lsn_request(timeline, req, ctx) + .await + } else { + // The Tenant shard we looked up at connection start does not hold this particular + // key: look for other shards in this tenant. This scenario occurs if a pageserver + // has multiple shards for the same tenant. + // + // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037) + let timeline = match self + .get_active_tenant_timeline( + timeline.tenant_shard_id.tenant_id, + timeline.timeline_id, + ShardSelector::Page(key), + ) + .await + { + Ok(t) => t, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node. + + // TODO: this should be some kind of structured error that the client will understand, + // so that it can block until its config is updated: this error is expected in the case + // that the Tenant's shards' placements are being updated and the client hasn't been + // informed yet. 
+ // + // https://github.com/neondatabase/neon/issues/6038 + tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", + timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); + return Err(anyhow::anyhow!("Request routed to wrong shard")); + } + Err(e) => return Err(e.into()), + }; + + // Take a GateGuard for the duration of this request. If we were using our main Timeline object, + // the GateGuard was already held over the whole connection. + let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; + + self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) + .await + } + } + #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -838,7 +895,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id) + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { @@ -944,9 +1001,11 @@ impl PageServerHandler { &self, tenant_id: TenantId, timeline_id: TimelineId, + selector: ShardSelector, ) -> Result, GetActiveTimelineError> { let tenant = get_active_tenant_with_timeout( tenant_id, + selector, ACTIVE_TENANT_TIMEOUT, &task_mgr::shutdown_token(), ) @@ -1120,7 +1179,7 @@ where self.check_permission(Some(tenant_id))?; let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id) + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -1307,6 +1366,7 @@ where let tenant = get_active_tenant_with_timeout( tenant_id, + ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT, &task_mgr::shutdown_token(), ) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 
15d5609ceb..9fe75e5baf 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -11,8 +11,9 @@ use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; -use anyhow::Context; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; +use pageserver_api::key::is_rel_block_key; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -146,6 +147,7 @@ impl Timeline { { DatadirModification { tline: self, + pending_lsns: Vec::new(), pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -162,7 +164,7 @@ impl Timeline { &self, tag: RelTag, blknum: BlockNumber, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -172,17 +174,20 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, lsn, nblocks + tag, + blknum, + version.get_lsn(), + nblocks ); return Ok(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn, ctx).await + version.get(self, key, ctx).await } // Get size of a database in blocks @@ -190,16 +195,16 @@ impl Timeline { &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; + let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -209,7 +214,7 @@ impl Timeline { pub 
async fn get_rel_size( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -219,12 +224,12 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest, ctx).await? + && !self.get_rel_exists(tag, version, latest, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -234,7 +239,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -245,7 +250,7 @@ impl Timeline { // latest=true, then it can not cause cache corruption, because with latest=true // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be // associated with most recent value of LSN. 
- self.update_cached_rel_size(tag, lsn, nblocks); + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); } Ok(nblocks) } @@ -254,7 +259,7 @@ impl Timeline { pub async fn get_rel_exists( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, _latest: bool, ctx: &RequestContext, ) -> Result { @@ -265,12 +270,12 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -282,16 +287,20 @@ impl Timeline { } /// Get a list of all existing relations in given tablespace and database. + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn list_rels( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -327,11 +336,11 @@ impl Timeline { &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) } @@ -340,12 +349,12 @@ impl Timeline { &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match 
SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -496,11 +505,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) .await? { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self @@ -526,13 +535,13 @@ impl Timeline { pub async fn list_slru_segments( &self, kind: SlruKind, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -543,12 +552,12 @@ impl Timeline { &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; Ok(buf) } @@ -630,6 +639,10 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, @@ -643,7 +656,10 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + for rel in self + .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .await? 
+ { if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -683,7 +699,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn, ctx) + .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) .await? .into_iter() .collect(); @@ -790,18 +806,39 @@ pub struct DatadirModification<'a> { /// in the state in 'tline' yet. pub tline: &'a Timeline, - /// Lsn assigned by begin_modification - pub lsn: Lsn, + /// Current LSN of the modification + lsn: Lsn, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. - pending_updates: HashMap, - pending_deletions: Vec>, + pending_lsns: Vec, + pending_updates: HashMap>, + pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, } impl<'a> DatadirModification<'a> { + /// Get the current lsn + pub(crate) fn get_lsn(&self) -> Lsn { + self.lsn + } + + /// Set the current lsn + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { + ensure!( + lsn >= self.lsn, + "setting an older lsn {} than {} is not allowed", + lsn, + self.lsn + ); + if lsn > self.lsn { + self.pending_lsns.push(self.lsn); + self.lsn = lsn; + } + Ok(()) + } + /// Initialize a completely new repository. 
/// /// This inserts the directory metadata entries that are assumed to @@ -813,10 +850,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), @@ -924,10 +958,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory as well - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -981,11 +1012,9 @@ impl<'a> DatadirModification<'a> { dbnode: Oid, ctx: &RequestContext, ) -> anyhow::Result<()> { - let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) .await?; // Remove entry from dbdir @@ -1074,8 +1103,11 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); - let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { + if self + .tline + .get_rel_exists(rel, Version::Modified(self), true, ctx) + .await? 
+ { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1252,6 +1284,14 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: HashMap::new(), + })?; + self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + Ok(()) + } + pub async fn put_file( &mut self, path: &str, @@ -1312,17 +1352,23 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::new(); - for (key, value) in self.pending_updates.drain() { - if is_rel_block_key(key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - writer.put(key, self.lsn, &value, ctx).await?; - } else { - retained_pending_updates.insert(key, value); + let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); + for (key, values) in self.pending_updates.drain() { + for (lsn, value) in values { + if is_rel_block_key(&key) || is_slru_block_key(key) { + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. 
+ writer.put(key, lsn, &value, ctx).await?; + } else { + retained_pending_updates + .entry(key) + .or_default() + .push((lsn, value)); + } } } - self.pending_updates.extend(retained_pending_updates); + + self.pending_updates = retained_pending_updates; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1339,18 +1385,28 @@ impl<'a> DatadirModification<'a> { /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; - let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value, ctx).await?; - } - for key_range in self.pending_deletions.drain(..) { - writer.delete(key_range, lsn).await?; + if !self.pending_updates.is_empty() { + writer.put_batch(&self.pending_updates, ctx).await?; + self.pending_updates.clear(); } - writer.finish_write(lsn); + if !self.pending_deletions.is_empty() { + writer.delete_batch(&self.pending_deletions).await?; + self.pending_deletions.clear(); + } + + self.pending_lsns.push(self.lsn); + for pending_lsn in self.pending_lsns.drain(..) { + // Ideally, we should be able to call writer.finish_write() only once + // with the highest LSN. However, the last_record_lsn variable in the + // timeline keeps track of the latest LSN and the immediate previous LSN + // so we need to record every LSN to not leave a gap between them. + writer.finish_write(pending_lsn); + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1359,40 +1415,86 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub(crate) fn len(&self) -> usize { + self.pending_updates.len() + self.pending_deletions.len() + } + // Internal helper functions to batch the modifications async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? 
Read the pending updated + // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. - if let Some(value) = self.pending_updates.get(&key) { - if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) + if let Some(values) = self.pending_updates.get(&key) { + if let Some((_, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. 
+ Err(PageReconstructError::from(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; } - } else { - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn, ctx).await } + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn, ctx).await } fn put(&mut self, key: Key, val: Value) { - self.pending_updates.insert(key, val); + let values = self.pending_updates.entry(key).or_default(); + // Replace the previous value if it exists at the same lsn + if let Some((last_lsn, last_value)) = values.last_mut() { + if *last_lsn == self.lsn { + *last_value = val; + return; + } + } + values.push((self.lsn, val)); } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); - self.pending_deletions.push(key_range); + self.pending_deletions.push((key_range, self.lsn)); + } +} + +/// This struct facilitates accessing either a committed key from the timeline at a +/// specific LSN, or the latest uncommitted key from a pending modification. +/// During WAL ingestion, the records from multiple LSNs may be batched in the same +/// modification before being flushed to the timeline. Hence, the routines in WalIngest +/// need to look up the keys in the modification first before looking them up in the +/// timeline to not miss the latest updates. 
+#[derive(Clone, Copy)] +pub enum Version<'a> { + Lsn(Lsn), + Modified(&'a DatadirModification<'a>), +} + +impl<'a> Version<'a> { + async fn get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result { + match self { + Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::Modified(modification) => modification.get(key, ctx).await, + } + } + + fn get_lsn(&self) -> Lsn { + match self { + Version::Lsn(lsn) => *lsn, + Version::Modified(modification) => modification.lsn, + } } } @@ -1570,7 +1672,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } -fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { +pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { field1: 0x00, field2: rel.spcnode, @@ -1754,6 +1856,14 @@ const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +// AUX_FILES currently stores only data for logical replication (slots etc), and +// we don't preserve these on a branch because safekeepers can't follow timeline +// switch (and generally it likely should be optional), so ignore these. +pub fn is_inherited_key(key: Key) -> bool { + key != AUX_FILES_KEY +} + +/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. 
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( @@ -1768,11 +1878,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } - -fn is_rel_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 -} - pub fn is_rel_fsm_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 97d731bf49..c726139524 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::{AddAssign, Range}; +use std::ops::AddAssign; use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - -pub fn singleton_range(key: Key) -> Range { - key..key.next() -} - /// A 'value' stored for a one Key. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(test, derive(PartialEq))] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 4270b6edb0..eabb1d0022 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use pageserver_api::shard::TenantShardId; use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; @@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use crate::shutdown_pageserver; @@ -146,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| // else, but that has not been needed in a long time. std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(1, num_cpus::get())) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) }); #[derive(Debug, Clone, Copy)] @@ -257,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. 
+ SecondaryUploads, + // Initial logical size calculation InitialLogicalSizeCalculation, @@ -317,7 +321,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, mutable: Mutex, @@ -329,7 +333,7 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, name: &str, shutdown_process_on_error: bool, @@ -345,7 +349,7 @@ where kind, name: name.to_string(), cancel: cancel.clone(), - tenant_id, + tenant_shard_id, timeline_id, mutable: Mutex::new(MutableTaskState { join_handle: None }), }); @@ -424,28 +428,28 @@ async fn task_finish( Ok(Err(err)) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } Err(err) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' 
tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } @@ -467,11 +471,11 @@ async fn task_finish( /// /// Or to shut down all tasks for given timeline: /// -/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id)) +/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id)) /// pub async fn shutdown_tasks( kind: Option, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, ) { let mut victim_tasks = Vec::new(); @@ -480,35 +484,35 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_id.is_none() || task.tenant_id == tenant_id) + && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) && (timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); victim_tasks.push(( Arc::clone(task), task.kind, - task.tenant_id, + task.tenant_shard_id, task.timeline_id, )); } } } - let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none(); - for (task, task_kind, tenant_id, timeline_id) in victim_tasks { + for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_id.is_none() { + if tenant_shard_id.is_none() { // there are quite few of these info!(name = task.name, kind = ?task_kind, "stopping global task"); } else { // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } } if 
tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) @@ -517,12 +521,13 @@ pub async fn shutdown_tasks( { // allow some time to elapse before logging to cut down the number of log // lines. - info!("waiting for {} to shut down", task.name); + info!("waiting for task {} to shut down", task.name); // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) // - task errors are already logged in the wrapper let _ = join_handle.await; + info!("task {} completed", task.name); } } else { // Possibly one of: @@ -556,9 +561,14 @@ pub async fn shutdown_watcher() { /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or /// `tokio::task::JoinSet::spawn`. pub fn shutdown_token() -> CancellationToken { - SHUTDOWN_TOKEN - .try_with(|t| t.clone()) - .expect("shutdown_token() called in an unexpected task or thread") + let res = SHUTDOWN_TOKEN.try_with(|t| t.clone()); + + if cfg!(test) { + // in tests this method is called from non-taskmgr spawned tasks, and that is all ok. + res.unwrap_or_default() + } else { + res.expect("shutdown_token() called in an unexpected task or thread") + } } /// Has the current task been requested to shut down? diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 08066a612d..1660de8923 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,18 +12,19 @@ //! 
use anyhow::{bail, Context}; -use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use pageserver_api::models::TimelineState; +use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use std::fmt; use storage_broker::BrokerClientChannel; +use tokio::io::BufReader; use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; @@ -32,9 +33,12 @@ use tracing::*; use utils::backoff; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::failpoint_support; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; +use utils::timeout::timeout_cancellable; +use utils::timeout::TimeoutCancellableError; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -47,6 +51,7 @@ use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; @@ -57,7 +62,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::TENANT_ACTIVATION; +use crate::metrics::TENANT; use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; @@ -67,6 +72,7 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; +use 
crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; @@ -85,7 +91,6 @@ use std::process::Stdio; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::MutexGuard; use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; @@ -142,6 +147,7 @@ pub mod storage_layer; pub mod config; pub mod delete; pub mod mgr; +pub mod secondary; pub mod tasks; pub mod upload_queue; @@ -223,7 +229,7 @@ pub struct Tenant { /// The value creation timestamp, used to measure activation delay, see: /// - loading_started_at: Instant, + constructed_at: Instant, state: watch::Sender, @@ -235,6 +241,9 @@ pub struct Tenant { tenant_shard_id: TenantShardId, + // The detailed sharding information, beyond the number/count in tenant_shard_id + shard_identity: ShardIdentity, + /// The remote storage generation, used to protect S3 objects from split-brain. /// Does not change over the lifetime of the [`Tenant`] object. /// @@ -243,6 +252,12 @@ pub struct Tenant { generation: Generation, timelines: Mutex>>, + + /// During timeline creation, we first insert the TimelineId to the + /// creating map, then `timelines`, then remove it from the creating map. + /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating` + timelines_creating: std::sync::Mutex>, + // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -264,6 +279,11 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it + /// to proceed to Active as soon as possible, rather than waiting for lazy + /// background warmup. 
+ pub(crate) activate_now_sem: tokio::sync::Semaphore, + pub(crate) delete_progress: Arc>, // Cancellation token fires when we have entered shutdown(). This is a parent of @@ -311,6 +331,9 @@ impl WalRedoManager { } } + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn request_redo( &self, key: crate::repository::Key, @@ -398,8 +421,10 @@ impl Debug for SetStoppingError { #[derive(thiserror::Error, Debug)] pub enum CreateTimelineError { - #[error("a timeline with the given ID already exists")] - AlreadyExists, + #[error("creation of timeline with the given ID is in progress")] + AlreadyCreating, + #[error("timeline already exists with different parameters")] + Conflict, #[error(transparent)] AncestorLsn(anyhow::Error), #[error("ancestor timeline is not active")] @@ -468,7 +493,6 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, - init_order: Option<&InitializationOrder>, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -478,7 +502,6 @@ impl Tenant { &metadata, ancestor.clone(), resources, - init_order, CreateTimelineCause::Load, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); @@ -566,15 +589,15 @@ impl Tenant { tenant_shard_id: TenantShardId, resources: TenantSharedResources, attached_conf: AttachedTenantConf, + shard_identity: ShardIdentity, init_order: Option, tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); let TenantSharedResources { @@ -587,6 +610,7 @@ impl Tenant { TenantState::Attaching, conf, attached_conf, + shard_identity, wal_redo_manager, tenant_shard_id, remote_storage.clone(), @@ -600,11 +624,19 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - 
Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "attach tenant", false, async move { + // Is this tenant being spawned as part of process startup? + let starting_up = init_order.is_some(); + scopeguard::defer! { + if starting_up { + TENANT.startup_complete.inc(); + } + } + // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. let make_broken = |t: &Tenant, err: anyhow::Error| { @@ -631,8 +663,62 @@ impl Tenant { .as_mut() .and_then(|x| x.initial_tenant_load_remote.take()); + enum AttachType<'a> { + // During pageserver startup, we are attaching this tenant lazily in the background + Warmup(tokio::sync::SemaphorePermit<'a>), + // During pageserver startup, we are attaching this tenant as soon as we can, + // because a client tried to access it. + OnDemand, + // During normal operations after startup, we are attaching a tenant. + Normal, + } + + // Before doing any I/O, wait for either or: + // - A client to attempt to access to this tenant (on-demand loading) + // - A permit to become available in the warmup semaphore (background warmup) + // + // Some-ness of init_order is how we know if we're attaching during startup or later + // in process lifetime. + let attach_type = if init_order.is_some() { + tokio::select!( + _ = tenant_clone.activate_now_sem.acquire() => { + tracing::info!("Activating tenant (on-demand)"); + AttachType::OnDemand + }, + permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { + match permit_result { + Ok(p) => { + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup(p) + } + Err(_) => { + // This is unexpected: the warmup semaphore should stay alive + // for the lifetime of init_order. Log a warning and proceed. 
+ tracing::warn!("warmup_limit semaphore unexpectedly closed"); + AttachType::Normal + } + } + + } + _ = tenant_clone.cancel.cancelled() => { + // This is safe, but should be pretty rare: it is interesting if a tenant + // stayed in Activating for such a long time that shutdown found it in + // that state. + tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + return Ok(()); + }, + ) + } else { + AttachType::Normal + }; + + let preload_timer = TENANT.preload.start_timer(); let preload = match mode { - SpawnMode::Create => {None}, + SpawnMode::Create => { + // Don't count the skipped preload into the histogram of preload durations + preload_timer.stop_and_discard(); + None + }, SpawnMode::Normal => { match &remote_storage { Some(remote_storage) => Some( @@ -642,7 +728,11 @@ impl Tenant { tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { - Ok(p) => p, + Ok(p) => { + preload_timer.observe_duration(); + p + } + , Err(e) => { make_broken(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); @@ -679,10 +769,6 @@ impl Tenant { // as we are no longer loading, signal completion by dropping // the completion while we resume deletion drop(_completion); - // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout - let _ = init_order - .as_mut() - .and_then(|x| x.initial_logical_size_attempt.take()); let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); if let Some(background) = background_jobs_can_start { @@ -696,7 +782,6 @@ impl Tenant { &tenant_clone, preload, tenants, - init_order, &ctx, ) .await @@ -709,15 +794,43 @@ impl Tenant { } } - match tenant_clone.attach(init_order, preload, &ctx).await { + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_timer = match mode { + SpawnMode::Create => None, + 
SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; + match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); + if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { + if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } + + // If we are doing an opportunistic warmup attachment at startup, initialize + // logical size at the same time. This is better than starting a bunch of idle tenants + // with cold caches and then coming back later to initialize their logical sizes. + // + // It also prevents the warmup proccess competing with the concurrency limit on + // logical size calculations: if logical size calculation semaphore is saturated, + // then warmup will wait for that before proceeding to the next tenant. + if let AttachType::Warmup(_permit) = attach_type { + let mut futs = FuturesUnordered::new(); + let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); + for t in timelines { + futs.push(t.await_initial_logical_size()) + } + tracing::info!("Waiting for initial logical sizes while warming up..."); + while futs.next().await.is_some() { + + } + tracing::info!("Warm-up complete"); + } + Ok(()) } .instrument({ @@ -772,20 +885,19 @@ impl Tenant { /// async fn attach( self: &Arc, - init_order: Option, preload: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + failpoint_support::sleep_millis_async!("before-attaching-tenant"); let preload = match preload { Some(p) => p, None => { // Deprecated dev mode: load from local disk state instead of remote storage // https://github.com/neondatabase/neon/issues/5624 - return self.load_local(init_order, ctx).await; + return self.load_local(ctx).await; } }; @@ -795,20 +907,31 @@ impl Tenant { let 
mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { - // In this context a timeline "exists" if it has any content in remote storage: this will - // be our cue to not delete any corresponding local directory - existent_timelines.insert(timeline_id); - let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); + // We found index_part on the remote, this is the standard case. + existent_timelines.insert(timeline_id); i } + Err(DownloadError::NotFound) => { + // There is no index_part on the remote. We only get here + // if there is some prefix for the timeline in the remote storage. + // This can e.g. be the initdb.tar.zst archive, maybe a + // remnant from a prior incomplete creation or deletion attempt. + // Delete the local directory as the deciding criterion for a + // timeline's existence is presence of index_part. + info!(%timeline_id, "index_part not found on remote"); + continue; + } Err(e) => { - // Timeline creation is not atomic: we might upload a layer but no index_part. We expect - // that the creation will be retried by the control plane and eventually result in - // a valid loadable state. + // Some (possibly ephemeral) error happened during index_part download. + // Pretend the timeline exists to not delete the timeline directory, + // as it might be a temporary issue and we don't want to re-download + // everything after it resolves. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + + existent_timelines.insert(timeline_id); continue; } }; @@ -869,7 +992,6 @@ impl Tenant { &index_part.metadata, Some(remote_timeline_client), self.deletion_queue_client.clone(), - None, ) .await .context("resume_deletion") @@ -880,7 +1002,7 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; - crate::failpoint_support::sleep_millis_async!("attach-before-activate"); + failpoint_support::sleep_millis_async!("attach-before-activate"); info!("Done"); @@ -994,10 +1116,6 @@ impl Tenant { None }; - // we can load remote timelines during init, but they are assumed to be so rare that - // initialization order is not passed to here. - let init_order = None; - // timeline loading after attach expects to find metadata file for each metadata save_metadata( self.conf, @@ -1015,7 +1133,6 @@ impl Tenant { Some(index_part), remote_metadata, ancestor, - init_order, ctx, ) .await @@ -1027,10 +1144,9 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1039,6 +1155,9 @@ impl Tenant { }, conf, AttachedTenantConf::try_from(LocationConf::default()).unwrap(), + // Shard identity isn't meaningful for a broken tenant: it's just a placeholder + // to occupy the slot for this TenantShardId. + ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), wal_redo_manager, tenant_shard_id, None, @@ -1257,11 +1376,7 @@ impl Tenant { /// files on disk. Used at pageserver startup. /// /// No background tasks are started as part of this routine. 
- async fn load_local( - self: &Arc, - init_order: Option, - ctx: &RequestContext, - ) -> anyhow::Result<()> { + async fn load_local(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); debug!("loading tenant task"); @@ -1287,7 +1402,7 @@ impl Tenant { // Process loadable timelines first for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false) + .load_local_timeline(timeline_id, local_metadata, ctx, false) .await { match e { @@ -1321,13 +1436,7 @@ impl Tenant { } Some(local_metadata) => { if let Err(e) = self - .load_local_timeline( - timeline_id, - local_metadata, - init_order.as_ref(), - ctx, - true, - ) + .load_local_timeline(timeline_id, local_metadata, ctx, true) .await { match e { @@ -1355,12 +1464,11 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! 
- #[instrument(skip(self, local_metadata, init_order, ctx))] + #[instrument(skip(self, local_metadata, ctx))] async fn load_local_timeline( self: &Arc, timeline_id: TimelineId, local_metadata: TimelineMetadata, - init_order: Option<&InitializationOrder>, ctx: &RequestContext, found_delete_mark: bool, ) -> Result<(), LoadLocalTimelineError> { @@ -1377,7 +1485,6 @@ impl Tenant { &local_metadata, None, self.deletion_queue_client.clone(), - init_order, ) .await .context("resume deletion") @@ -1394,17 +1501,9 @@ impl Tenant { None }; - self.timeline_init_and_sync( - timeline_id, - resources, - None, - local_metadata, - ancestor, - init_order, - ctx, - ) - .await - .map_err(LoadLocalTimelineError::Load) + self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx) + .await + .map_err(LoadLocalTimelineError::Load) } pub(crate) fn tenant_id(&self) -> TenantId { @@ -1452,6 +1551,10 @@ impl Tenant { .collect() } + pub fn list_timeline_ids(&self) -> Vec { + self.timelines.lock().unwrap().keys().cloned().collect() + } + /// This is used to create the initial 'main' timeline during bootstrapping, /// or when importing a new base backup. The caller is expected to load an /// initial image of the datadir to the new timeline after this. @@ -1467,7 +1570,7 @@ impl Tenant { /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) - pub async fn create_empty_timeline( + pub(crate) async fn create_empty_timeline( &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, @@ -1479,10 +1582,7 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(new_timeline_id, &timelines)? 
- }; + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1558,12 +1658,14 @@ impl Tenant { /// /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. - pub async fn create_timeline( + #[allow(clippy::too_many_arguments)] + pub(crate) async fn create_timeline( &self, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + load_existing_initdb: Option, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> Result, CreateTimelineError> { @@ -1578,26 +1680,51 @@ impl Tenant { .enter() .map_err(|_| CreateTimelineError::ShuttingDown)?; - if let Ok(existing) = self.get_timeline(new_timeline_id, false) { - debug!("timeline {new_timeline_id} already exists"); - - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; + // Get exclusive access to the timeline ID: this ensures that it does not already exist, + // and that no other creation attempts will be allowed in while we are working. The + // uninit_mark is a guard. 
+ let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + Ok(m) => m, + Err(TimelineExclusionError::AlreadyCreating) => { + // Creation is in progress, we cannot create it again, and we cannot + // check if this request matches the existing one, so caller must try + // again later. + return Err(CreateTimelineError::AlreadyCreating); } + Err(TimelineExclusionError::Other(e)) => { + return Err(CreateTimelineError::Other(e)); + } + Err(TimelineExclusionError::AlreadyExists(existing)) => { + debug!("timeline {new_timeline_id} already exists"); - return Err(CreateTimelineError::AlreadyExists); - } + // Idempotency: creating the same timeline twice is not an error, unless + // the second creation has different parameters. + if existing.get_ancestor_timeline_id() != ancestor_timeline_id + || existing.pg_version != pg_version + || (ancestor_start_lsn.is_some() + && ancestor_start_lsn != Some(existing.get_ancestor_lsn())) + { + return Err(CreateTimelineError::Conflict); + } + + if let Some(remote_client) = existing.remote_client.as_ref() { + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; + } + + return Ok(existing); + } + }; let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { @@ -1634,18 +1761,32 @@ impl Tenant { ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) - .await? 
+ self.branch_timeline( + &ancestor_timeline, + new_timeline_id, + ancestor_start_lsn, + uninit_mark, + ctx, + ) + .await? } None => { - self.bootstrap_timeline(new_timeline_id, pg_version, ctx) - .await? + self.bootstrap_timeline( + new_timeline_id, + pg_version, + load_existing_initdb, + uninit_mark, + ctx, + ) + .await? } }; + // At this point we have dropped our guard on [`Self::timelines_creating`], and + // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must + // not send a success to the caller until it is. The same applies to handling retries, + // see the handling of [`TimelineExclusionError::AlreadyExists`] above. if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - // Wait for the upload of the 'index_part.json` file to finish, so that when we return - // Ok, the timeline is durable in remote storage. let kind = ancestor_timeline_id .map(|_| "branched") .unwrap_or("bootstrapped"); @@ -1659,6 +1800,15 @@ impl Tenant { Ok(loaded_timeline) } + pub(crate) async fn delete_timeline( + self: Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + DeleteTimelineFlow::run(&self, timeline_id, false).await?; + + Ok(()) + } + /// perform one garbage collection iteration, removing old data files from disk. /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. 
@@ -1820,7 +1970,7 @@ impl Tenant { ); *current_state = TenantState::Active; - let elapsed = self.loading_started_at.elapsed(); + let elapsed = self.constructed_at.elapsed(); let total_timelines = timelines_accessor.len(); // log a lot of stuff, because some tenants sometimes suffer from user-visible @@ -1835,7 +1985,7 @@ impl Tenant { "activation attempt finished" ); - TENANT_ACTIVATION.observe(elapsed.as_secs_f64()); + TENANT.activation.observe(elapsed.as_secs_f64()); }); } } @@ -1925,7 +2075,7 @@ impl Tenant { // // this will additionally shutdown and await all timeline tasks. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; // Wait for any in-flight operations to complete self.gate.close().await; @@ -2090,18 +2240,41 @@ impl Tenant { self.state.subscribe() } - pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> { + /// The activate_now semaphore is initialized with zero units. As soon as + /// we add a unit, waiters will be able to acquire a unit and proceed. 
+ pub(crate) fn activate_now(&self) { + self.activate_now_sem.add_permits(1); + } + + pub(crate) async fn wait_to_become_active( + &self, + timeout: Duration, + ) -> Result<(), GetActiveTenantError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await.map_err( - |_e: tokio::sync::watch::error::RecvError| - // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) - )?; + self.activate_now(); + match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { + Ok(r) => { + r.map_err( + |_e: tokio::sync::watch::error::RecvError| + // Tenant existed but was dropped: report it as non-existent + GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + )? + } + Err(TimeoutCancellableError::Cancelled) => { + return Err(GetActiveTenantError::Cancelled); + } + Err(TimeoutCancellableError::Timeout) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: Some(self.current_state()), + wait_time: timeout, + }); + } + } } TenantState::Active { .. 
} => { return Ok(()); @@ -2122,6 +2295,14 @@ impl Tenant { .attach_mode .clone() } + + pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } + + pub(crate) fn get_generation(&self) -> Generation { + self.generation + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -2260,6 +2441,18 @@ impl Tenant { .or(self.conf.default_tenant_conf.min_resident_size_override) } + pub fn get_heatmap_period(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let heatmap_period = tenant_conf + .heatmap_period + .unwrap_or(self.conf.default_tenant_conf.heatmap_period); + if heatmap_period.is_zero() { + None + } else { + Some(heatmap_period) + } + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; // Don't hold self.timelines.lock() during the notifies. @@ -2297,7 +2490,6 @@ impl Tenant { new_metadata: &TimelineMetadata, ancestor: Option>, resources: TimelineResources, - init_order: Option<&InitializationOrder>, cause: CreateTimelineCause, ) -> anyhow::Result> { let state = match cause { @@ -2312,9 +2504,6 @@ impl Tenant { CreateTimelineCause::Delete => TimelineState::Stopping, }; - let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); - let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt); - let pg_version = new_metadata.pg_version(); let timeline = Timeline::new( @@ -2325,11 +2514,10 @@ impl Tenant { new_timeline_id, self.tenant_shard_id, self.generation, + self.shard_identity, Arc::clone(&self.walredo_mgr), resources, pg_version, - initial_logical_size_can_start.cloned(), - initial_logical_size_attempt.cloned().flatten(), state, self.cancel.child_token(), ); @@ -2344,6 +2532,7 @@ impl Tenant { state: TenantState, conf: &'static PageServerConf, attached_conf: AttachedTenantConf, + shard_identity: ShardIdentity, walredo_mgr: 
Arc, tenant_shard_id: TenantShardId, remote_storage: Option, @@ -2405,13 +2594,15 @@ impl Tenant { Tenant { tenant_shard_id, + shard_identity, generation: attached_conf.location.generation, conf, // using now here is good enough approximation to catch tenants with really long // activation times. - loading_started_at: Instant::now(), + constructed_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), + timelines_creating: Mutex::new(HashSet::new()), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, @@ -2420,6 +2611,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), @@ -2526,7 +2718,7 @@ impl Tenant { } } - info!("persisting tenantconf to {config_path}"); + debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -2561,7 +2753,7 @@ impl Tenant { target_config_path: &Utf8Path, tenant_conf: &TenantConfOpt, ) -> anyhow::Result<()> { - info!("persisting tenantconf to {target_config_path}"); + debug!("persisting tenantconf to {target_config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -2646,9 +2838,7 @@ impl Tenant { } }; - crate::failpoint_support::sleep_millis_async!( - "gc_iteration_internal_after_getting_gc_timelines" - ); + failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); // If there is nothing to GC, we don't want any messages in the INFO log. 
if !gc_timelines.is_empty() { @@ -2803,8 +2993,9 @@ impl Tenant { start_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { + let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) @@ -2818,9 +3009,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) .await } @@ -2829,13 +3021,14 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; - // First acquire the GC lock so that another task cannot advance the GC - // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are - // creating the branch. + // We will validate our ancestor LSN in this function. Acquire the GC lock so that + // this check cannot race with GC, and the ancestor LSN is guaranteed to remain + // valid while we are creating the branch. let _gc_cs = self.gc_cs.lock().await; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -2845,13 +3038,6 @@ impl Tenant { lsn }); - // Create a placeholder for the new branch. This will error - // out if the new timeline ID is already in use. - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst_id, &timelines)? - }; - // Ensure that `start_lsn` is valid, i.e. 
the LSN is within the PITR // horizon on the source timeline // @@ -2943,26 +3129,45 @@ impl Tenant { Ok(new_timeline) } + /// For unit tests, make this visible so that other modules can directly create timelines + #[cfg(test)] + #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] + pub(crate) async fn bootstrap_timeline_test( + &self, + timeline_id: TimelineId, + pg_version: u32, + load_existing_initdb: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + self.bootstrap_timeline( + timeline_id, + pg_version, + load_existing_initdb, + uninit_mark, + ctx, + ) + .await + } + /// - run initdb to init temporary instance and get bootstrap data /// - after initialization completes, tar up the temp dir and upload it to S3. /// /// The caller is responsible for activating the returned timeline. - pub(crate) async fn bootstrap_timeline( + async fn bootstrap_timeline( &self, timeline_id: TimelineId, pg_version: u32, + load_existing_initdb: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, ctx: &RequestContext, ) -> anyhow::Result> { - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(timeline_id, &timelines)? - }; // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. 
+ + let timelines_path = self.conf.timelines_path(&self.tenant_shard_id); let pgdata_path = path_with_suffix_extension( - self.conf - .timelines_path(&self.tenant_shard_id) - .join(format!("basebackup-{timeline_id}")), + timelines_path.join(format!("basebackup-{timeline_id}")), TEMP_FILE_SUFFIX, ); @@ -2973,8 +3178,6 @@ impl Tenant { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } - // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path - run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! { if let Err(e) = fs::remove_dir_all(&pgdata_path) { @@ -2982,31 +3185,83 @@ impl Tenant { error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); } } - let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + if let Some(existing_initdb_timeline_id) = load_existing_initdb { + let Some(storage) = &self.remote_storage else { + bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); + }; + let (initdb_tar_zst_path, initdb_tar_zst) = + self::remote_timeline_client::download_initdb_tar_zst( + self.conf, + storage, + &self.tenant_shard_id, + &existing_initdb_timeline_id, + &self.cancel, + ) + .await + .context("download initdb tar")?; + let buf_read = + BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); + import_datadir::extract_tar_zst(&pgdata_path, buf_read) + .await + .context("extract initdb tar")?; - // Upload the created data dir to S3 - if let Some(storage) = &self.remote_storage { - let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?; - let pgdata_zstd = Bytes::from(pgdata_zstd); - backoff::retry( - || async { - self::remote_timeline_client::upload_initdb_dir( - storage, - &self.tenant_shard_id.tenant_id, - &timeline_id, - 
pgdata_zstd.clone(), - ) + tokio::fs::remove_file(&initdb_tar_zst_path) + .await + .or_else(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + // If something else already removed the file, ignore the error + Ok(()) + } else { + Err(e) + } + }) + .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?; + } else { + // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path + run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; + + // Upload the created data dir to S3 + if let Some(storage) = &self.remote_storage { + let temp_path = timelines_path.join(format!( + "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" + )); + + let (pgdata_zstd, tar_zst_size) = + import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?; + backoff::retry( + || async { + self::remote_timeline_client::upload_initdb_dir( + storage, + &self.tenant_shard_id.tenant_id, + &timeline_id, + pgdata_zstd.try_clone().await?, + tar_zst_size, + &self.cancel, + ) + .await + }, + |_| false, + 3, + u32::MAX, + "persist_initdb_tar_zst", + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + ) + .await?; + + tokio::fs::remove_file(&temp_path) .await - }, - |_| false, - 3, - u32::MAX, - "persist_initdb_tar_zst", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), - ) - .await?; + .or_else(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + // If something else already removed the file, ignore the error + Ok(()) + } else { + Err(e) + } + }) + .with_context(|| format!("tempfile removal {temp_path}"))?; + } } + let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. @@ -3104,11 +3359,11 @@ impl Tenant { /// at 'disk_consistent_lsn'. 
After any initial data has been imported, call /// `finish_creation` to insert the Timeline into the timelines map and to remove the /// uninit mark file. - async fn prepare_new_timeline( - &self, + async fn prepare_new_timeline<'a>( + &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark, + uninit_mark: TimelineUninitMark<'a>, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { @@ -3125,7 +3380,6 @@ impl Tenant { new_metadata, ancestor, resources, - None, CreateTimelineCause::Load, ) .context("Failed to create timeline data structure")?; @@ -3182,24 +3436,42 @@ impl Tenant { fn create_timeline_uninit_mark( &self, timeline_id: TimelineId, - timelines: &MutexGuard>>, - ) -> anyhow::Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - anyhow::ensure!( - timelines.get(&timeline_id).is_none(), - "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory" - ); - let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - anyhow::ensure!( - !timeline_path.exists(), - "Timeline {timeline_path} already exists, cannot create its uninit mark file", - ); - let uninit_mark_path = self .conf .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); - fs::File::create(&uninit_mark_path) + let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); + + let uninit_mark = TimelineUninitMark::new( + self, + timeline_id, + uninit_mark_path.clone(), + timeline_path.clone(), + )?; + + // At this stage, we have got exclusive access to in-memory state for this timeline ID + // for creation. 
+ // A timeline directory should never exist on disk already: + // - a previous failed creation would have cleaned up after itself + // - a pageserver restart would clean up timeline directories that don't have valid remote state + // + // Therefore it is an unexpected internal error to encounter a timeline directory already existing here, + // this error may indicate a bug in cleanup on failed creations. + if timeline_path.exists() { + return Err(TimelineExclusionError::Other(anyhow::anyhow!( + "Timeline directory already exists! This is a bug." + ))); + } + + // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees + // that during process runtime, colliding creations will be caught in-memory without getting + // as far as failing to write a file. + fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&uninit_mark_path) .context("Failed to create uninit mark file") .and_then(|_| { crashsafe::fsync_file_and_parent(&uninit_mark_path) @@ -3209,8 +3481,6 @@ impl Tenant { format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") })?; - let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); - Ok(uninit_mark) } @@ -3653,6 +3923,7 @@ pub(crate) mod harness { tenant_conf.evictions_low_residence_duration_metric_threshold, ), gc_feedback: Some(tenant_conf.gc_feedback), + heatmap_period: Some(tenant_conf.heatmap_period), } } } @@ -3788,6 +4059,8 @@ pub(crate) mod harness { self.generation, )) .unwrap(), + // This is a legacy/test code path: sharding isn't supported here. 
+ ShardIdentity::unsharded(), walredo_mgr, self.tenant_shard_id, Some(self.remote_storage.clone()), @@ -3797,7 +4070,7 @@ pub(crate) mod harness { match mode { LoadMode::Local => { tenant - .load_local(None, ctx) + .load_local(ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } @@ -3807,7 +4080,7 @@ pub(crate) mod harness { .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; tenant - .attach(None, Some(preload), ctx) + .attach(Some(preload), ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } @@ -3850,6 +4123,9 @@ pub(crate) mod harness { pub(crate) struct TestRedoManager; impl TestRedoManager { + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn request_redo( &self, key: Key, @@ -3954,13 +4230,7 @@ mod tests { .await { Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!( - e.to_string(), - format!( - "Timeline {}/{} already exists in pageserver's memory", - tenant.tenant_shard_id, TIMELINE_ID - ) - ), + Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()), } Ok(()) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7a454b53d2..2d4cd350d7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -46,6 +46,8 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -334,6 +336,11 @@ pub struct TenantConf { #[serde(with = "humantime_serde")] pub 
evictions_low_residence_duration_metric_threshold: Duration, pub gc_feedback: bool, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + pub heatmap_period: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -414,6 +421,11 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_feedback: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub heatmap_period: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -482,6 +494,7 @@ impl TenantConfOpt { .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), + heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), } } } @@ -519,6 +532,7 @@ impl Default for TenantConf { ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), gc_feedback: false, + heatmap_period: Duration::ZERO, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b7b2ef9c79..b21bad51ba 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -15,7 +15,6 @@ use crate::{ context::RequestContext, task_mgr::{self, TaskKind}, tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, - InitializationOrder, }; use super::{ @@ -49,6 +48,9 @@ pub(crate) enum DeleteTenantError { #[error("Timeline {0}")] Timeline(#[from] DeleteTimelineError), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -72,22 +74,24 @@ async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, 
tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; let data: &[u8] = &[]; backoff::retry( || async { + let data = bytes::Bytes::from_static(data); + let stream = futures::stream::once(futures::future::ready(Ok(data))); remote_storage - .upload(data, 0, &remote_mark_path, None) + .upload(stream, 0, &remote_mark_path, None) .await }, |_e| false, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("mark_upload")?; @@ -169,6 +173,7 @@ async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; @@ -178,8 +183,7 @@ async fn remove_tenant_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("remove_tenant_remote_delete_mark")?; @@ -321,9 +325,15 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id) - .await - .context("remote_mark")? 
+ create_remote_delete_mark( + conf, + remote_storage, + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -390,7 +400,6 @@ impl DeleteTenantFlow { tenant: &Arc, preload: Option, tenants: &'static std::sync::RwLock, - init_order: Option, ctx: &RequestContext, ) -> Result<(), DeleteTenantError> { let (_, progress) = completion::channel(); @@ -400,10 +409,7 @@ impl DeleteTenantFlow { .await .expect("cant be stopping or broken"); - tenant - .attach(init_order, preload, ctx) - .await - .context("attach")?; + tenant.attach(preload, ctx).await.context("attach")?; Self::background( guard, @@ -466,7 +472,7 @@ impl DeleteTenantFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "tenant_delete", false, @@ -527,8 +533,14 @@ impl DeleteTenantFlow { .context("timelines dir not empty")?; } - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id) - .await?; + remove_tenant_remote_delete_mark( + conf, + remote_storage.as_ref(), + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await?; fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( @@ -553,7 +565,7 @@ impl DeleteTenantFlow { // we encounter an InProgress marker, yield the barrier it contains and wait on it. let barrier = { let mut locked = tenants.write().unwrap(); - let removed = locked.remove(&tenant.tenant_shard_id.tenant_id); + let removed = locked.remove(tenant.tenant_shard_id); // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f34d62ba53..250de7247d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,7 +2,8 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::key::Key; +use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; @@ -27,7 +28,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::TENANT_MANAGER as METRICS; +use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, @@ -43,7 +44,6 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; -use super::timeline::delete::DeleteTimelineFlow; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -97,49 +97,78 @@ pub(crate) enum TenantsMap { ShuttingDown(BTreeMap), } -/// Helper for mapping shard-unaware functions to a sharding-aware map -/// TODO(sharding): all users of this must be made shard-aware. -fn exactly_one_or_none<'a>( - map: &'a BTreeMap, - tenant_id: &TenantId, -) -> Option<(&'a TenantShardId, &'a TenantSlot)> { - let mut slots = map.range(TenantShardId::tenant_range(*tenant_id)); - - // Retrieve the first two slots in the range: if both are populated, we must panic because the caller - // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time. 
- let slot_a = slots.next(); - let slot_b = slots.next(); - match (slot_a, slot_b) { - (None, None) => None, - (Some(slot), None) => { - // Exactly one matching slot - Some(slot) - } - (Some(_slot_a), Some(_slot_b)) => { - // Multiple shards for this tenant: cannot handle this yet. - // TODO(sharding): callers of get() should be shard-aware. - todo!("Attaching multiple shards in teh same tenant to the same pageserver") - } - (None, Some(_)) => unreachable!(), - } -} - pub(crate) enum TenantsMapRemoveResult { Occupied(TenantSlot), Vacant, InProgress(utils::completion::Barrier), } +/// When resolving a TenantId to a shard, we may be looking for the 0th +/// shard, or we might be looking for whichever shard holds a particular page. +pub(crate) enum ShardSelector { + /// Only return the 0th shard, if it is present. If a non-0th shard is present, + /// ignore it. + Zero, + /// Pick the first shard we find for the TenantId + First, + /// Pick the shard that holds this key + Page(Key), +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, /// None is returned. - pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - // TODO(sharding): callers of get() should be shard-aware. - exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached()) + m.get(tenant_shard_id).and_then(|slot| slot.get_attached()) + } + } + } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. 
+ fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> Option { + let mut want_shard = None; + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + _ => continue, + }; + + match selector { + ShardSelector::First => return Some(*slot.0), + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return Some(*slot.0) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return Some(*slot.0); + } + } + _ => continue, + } + } + + // Fall through: we didn't find an acceptable shard + None } } } @@ -148,25 +177,19 @@ impl TenantsMap { /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded /// slot if the enclosed tenant is shutdown. 
- pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult { + pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult { use std::collections::btree_map::Entry; match self { TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k); - match key { - Some(key) => match m.entry(key) { - Entry::Occupied(entry) => match entry.get() { - TenantSlot::InProgress(barrier) => { - TenantsMapRemoveResult::InProgress(barrier.clone()) - } - _ => TenantsMapRemoveResult::Occupied(entry.remove()), - }, - Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, - }, - None => TenantsMapRemoveResult::Vacant, - } - } + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) { + Entry::Occupied(entry) => match entry.get() { + TenantSlot::InProgress(barrier) => { + TenantsMapRemoveResult::InProgress(barrier.clone()) + } + _ => TenantsMapRemoveResult::Occupied(entry.remove()), + }, + Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, + }, } } @@ -214,49 +237,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); -/// Create a directory, including parents. This does no fsyncs and makes -/// no guarantees about the persistence of the resulting metadata: for -/// use when creating dirs for use as cache. -async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> { - let mut dirs_to_create = Vec::new(); - let mut path: &Utf8Path = path.as_ref(); - - // Figure out which directories we need to create. 
- loop { - let meta = tokio::fs::metadata(path).await; - match meta { - Ok(metadata) if metadata.is_dir() => break, - Ok(_) => { - return Err(std::io::Error::new( - std::io::ErrorKind::AlreadyExists, - format!("non-directory found in path: {path}"), - )); - } - Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => return Err(e), - } - - dirs_to_create.push(path); - - match path.parent() { - Some(parent) => path = parent, - None => { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!("can't find parent of path '{path}'"), - )); - } - } - } - - // Create directories from parent to child. - for &path in dirs_to_create.iter().rev() { - tokio::fs::create_dir(path).await?; - } - - Ok(()) -} - /// The TenantManager is responsible for storing and mutating the collection of all tenants /// that this pageserver process has state for. Every Tenant and SecondaryTenant instance /// lives inside the TenantManager. @@ -451,6 +431,13 @@ pub async fn init_tenant_mgr( let tenant_generations = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + tracing::info!( + "Attaching {} tenants at startup, warming up {} at a time", + tenant_configs.len(), + conf.concurrent_tenant_warmup.initial_permits() + ); + TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); + // Construct `Tenant` objects and start them running for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -515,22 +502,21 @@ pub async fn init_tenant_mgr( location_conf.attach_in_generation(generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + let shard_identity = location_conf.shard; match tenant_spawn( conf, tenant_shard_id, &tenant_dir_path, resources.clone(), AttachedTenantConf::try_from(location_conf)?, + shard_identity, Some(init_order.clone()), &TENANTS, SpawnMode::Normal, &ctx, ) { Ok(tenant) => { - tenants.insert( - 
TenantShardId::unsharded(tenant.tenant_id()), - TenantSlot::Attached(tenant), - ); + tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); @@ -561,6 +547,7 @@ pub(crate) fn tenant_spawn( tenant_path: &Utf8Path, resources: TenantSharedResources, location_conf: AttachedTenantConf, + shard_identity: ShardIdentity, init_order: Option, tenants: &'static std::sync::RwLock, mode: SpawnMode, @@ -587,12 +574,19 @@ pub(crate) fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!("Attaching tenant {tenant_shard_id}"); + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + generation = ?location_conf.location.generation, + attach_mode = ?location_conf.location.attach_mode, + "Attaching tenant" + ); let tenant = match Tenant::spawn( conf, tenant_shard_id, resources, location_conf, + shard_identity, init_order, tenants, mode, @@ -762,12 +756,14 @@ pub(crate) async fn create_tenant( tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; + let shard_identity = location_conf.shard; let created_tenant = tenant_spawn( conf, tenant_shard_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, + shard_identity, None, &TENANTS, SpawnMode::Create, @@ -797,14 +793,16 @@ pub(crate) async fn set_new_tenant_config( new_tenant_conf: TenantConfOpt, tenant_id: TenantId, ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_id, true)?; + let tenant = get_tenant(tenant_shard_id, true)?; // This is a legacy API that only operates on attached tenants: the preferred // API to 
use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) .await @@ -814,6 +812,12 @@ pub(crate) async fn set_new_tenant_config( } impl TenantManager { + /// Convenience function so that anyone with a TenantManager can get at the global configuration, without + /// having to pass it around everywhere as a separate object. + pub(crate) fn get_conf(&self) -> &'static PageServerConf { + self.conf + } + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. pub(crate) fn get_attached_tenant_shard( @@ -849,17 +853,7 @@ impl TenantManager { } } - pub(crate) async fn delete_timeline( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?; - DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; - Ok(()) - } - + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, tenant_shard_id: TenantShardId, @@ -967,41 +961,35 @@ impl TenantManager { } let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); + + // Directory structure is the same for attached and secondary modes: + // create it if it doesn't exist. Timeline load/creation expects the + // timelines/ subdir to already exist. + // + // Does not need to be fsync'd because local storage is just a cache. 
+ tokio::fs::create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + // Before activating either secondary or attached mode, persist the + // configuration, so that on restart we will re-attach (or re-start + // secondary) on the tenant. + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. - unsafe_create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } + LocationMode::Secondary(_) => TenantSlot::Secondary, LocationMode::Attached(_attach_config) => { - let timelines_path = self.conf.timelines_path(&tenant_shard_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - unsafe_create_dir_all(&timelines_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - + let shard_identity = new_location_config.shard; let tenant = tenant_spawn( self.conf, tenant_shard_id, &tenant_path, self.resources.clone(), AttachedTenantConf::try_from(new_location_config)?, + shard_identity, None, self.tenants, SpawnMode::Normal, @@ -1016,6 +1004,160 @@ impl TenantManager { Ok(()) } + + /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same + /// LocationConf that was last used to attach 
it. Optionally, the local file cache may be + /// dropped before re-attaching. + /// + /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations + /// where an issue is identified that would go away with a restart of the tenant. + /// + /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks + /// to respect the cancellation tokens used in normal shutdown(). + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))] + pub(crate) async fn reset_tenant( + &self, + tenant_shard_id: TenantShardId, + drop_cache: bool, + ctx: RequestContext, + ) -> anyhow::Result<()> { + let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let Some(old_slot) = slot_guard.get_old_value() else { + anyhow::bail!("Tenant not found when trying to reset"); + }; + + let Some(tenant) = old_slot.get_attached() else { + slot_guard.revert(); + anyhow::bail!("Tenant is not in attached state"); + }; + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, false).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + anyhow::bail!("Cannot reset Tenant, already shutting down"); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + if drop_cache { + tracing::info!("Dropping local file cache"); + + match tokio::fs::read_dir(&timelines_path).await { + Err(e) => { + tracing::warn!("Failed to list timelines while dropping cache: {}", e); + } + Ok(mut entries) => { + while let Some(entry) = entries.next_entry().await? 
{ + tokio::fs::remove_dir_all(entry.path()).await?; + } + } + } + } + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + self.tenants, + SpawnMode::Normal, + &ctx, + )?; + + slot_guard.upsert(TenantSlot::Attached(tenant))?; + + Ok(()) + } + + pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec> { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => Vec::new(), + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map + .values() + .filter_map(|slot| { + slot.get_attached() + .and_then(|t| if t.is_active() { Some(t.clone()) } else { None }) + }) + .collect(), + } + } + + pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + activation_timeout: Duration, + ) -> Result<(), DeleteTenantError> { + // We acquire a SlotGuard during this function to protect against concurrent + // changes while the ::prepare phase of DeleteTenantFlow executes, but then + // have to return the Tenant to the map while the background deletion runs. + // + // TODO: refactor deletion to happen outside the lifetime of a Tenant. + // Currently, deletion requires a reference to the tenants map in order to + // keep the Tenant in the map until deletion is complete, and then remove + // it at the end. + // + // See https://github.com/neondatabase/neon/issues/5080 + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // unwrap is safe because we used MustExist mode when acquiring + let tenant = match slot_guard.get_old_value().as_ref().unwrap() { + TenantSlot::Attached(tenant) => tenant.clone(), + _ => { + // Express "not attached" as equivalent to "not found" + return Err(DeleteTenantError::NotAttached); + } + }; + + match tenant.current_state() { + TenantState::Broken { .. } | TenantState::Stopping { .. 
} => { + // If a tenant is broken or stopping, DeleteTenantFlow can + // handle it: broken tenants proceed to delete, stopping tenants + // are checked for deletion already in progress. + } + _ => { + tenant + .wait_to_become_active(activation_timeout) + .await + .map_err(|e| match e { + GetActiveTenantError::WillNotBecomeActive(_) => { + DeleteTenantError::InvalidState(tenant.current_state()) + } + GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, + GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, + GetActiveTenantError::WaitForActiveTimeout { + latest_state: _latest_state, + wait_time: _wait_time, + } => DeleteTenantError::InvalidState(tenant.current_state()), + })?; + } + } + + let result = DeleteTenantFlow::run( + self.conf, + self.resources.remote_storage.clone(), + &TENANTS, + tenant, + ) + .await; + + // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow + slot_guard.revert(); + result + } } #[derive(Debug, thiserror::Error)] @@ -1040,14 +1182,11 @@ pub(crate) enum GetTenantError { /// /// This method is cancel-safe. 
pub(crate) fn get_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, active_only: bool, ) -> Result, GetTenantError> { let locked = TENANTS.read().unwrap(); - // TODO(sharding): make all callers of get_tenant shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { @@ -1059,14 +1198,18 @@ pub(crate) fn get_tenant( TenantState::Active => Ok(Arc::clone(tenant)), _ => { if active_only { - Err(GetTenantError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } else { Ok(Arc::clone(tenant)) } } }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)), - None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)), + Some(TenantSlot::InProgress(_)) => { + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) + } + None | Some(TenantSlot::Secondary) => { + Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + } } } @@ -1100,6 +1243,7 @@ pub(crate) enum GetActiveTenantError { /// then wait for up to `timeout` (minus however long we waited for the slot). 
pub(crate) async fn get_active_tenant_with_timeout( tenant_id: TenantId, + shard_selector: ShardSelector, timeout: Duration, cancel: &CancellationToken, ) -> Result, GetActiveTenantError> { @@ -1108,15 +1252,19 @@ pub(crate) async fn get_active_tenant_with_timeout( Tenant(Arc), } - // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key - // to decide which shard services the request) - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let wait_start = Instant::now(); let deadline = wait_start + timeout; - let wait_for = { + let (wait_for, tenant_shard_id) = { let locked = TENANTS.read().unwrap(); + + // Resolve TenantId to TenantShardId + let tenant_shard_id = locked + .resolve_attached_shard(&tenant_id, shard_selector) + .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + )))?; + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .map_err(GetTenantError::MapState)?; match peek_slot { @@ -1126,7 +1274,10 @@ pub(crate) async fn get_active_tenant_with_timeout( // Fast path: we don't need to do any async waiting. 
return Ok(tenant.clone()); } - _ => WaitFor::Tenant(tenant.clone()), + _ => { + tenant.activate_now(); + (WaitFor::Tenant(tenant.clone()), tenant_shard_id) + } } } Some(TenantSlot::Secondary) => { @@ -1134,7 +1285,9 @@ pub(crate) async fn get_active_tenant_with_timeout( tenant_id, ))) } - Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()), + Some(TenantSlot::InProgress(barrier)) => { + (WaitFor::Barrier(barrier.clone()), tenant_shard_id) + } None => { return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( tenant_id, @@ -1178,64 +1331,10 @@ pub(crate) async fn get_active_tenant_with_timeout( }; tracing::debug!("Waiting for tenant to enter active state..."); - match timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - tenant.wait_to_become_active(), - ) - .await - { - Ok(Ok(())) => Ok(tenant), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - let latest_state = tenant.current_state(); - if latest_state == TenantState::Active { - Ok(tenant) - } else { - Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: Some(latest_state), - wait_time: timeout, - }) - } - } - Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled), - } -} - -pub(crate) async fn delete_tenant( - conf: &'static PageServerConf, - remote_storage: Option, - tenant_shard_id: TenantShardId, -) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. 
- // - // See https://github.com/neondatabase/neon/issues/5080 - - // TODO(sharding): make delete API sharding-aware - let mut slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result + tenant + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant) } #[derive(Debug, thiserror::Error)] @@ -1377,12 +1476,14 @@ pub(crate) async fn load_tenant( Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + let shard_identity = location_conf.shard; let new_tenant = tenant_spawn( conf, tenant_shard_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, + shard_identity, None, &TENANTS, SpawnMode::Normal, @@ -1433,7 +1534,8 @@ pub(crate) enum TenantMapListError { /// /// Get list of tenants, for the mgmt API /// -pub(crate) async fn list_tenants() -> Result, TenantMapListError> { +pub(crate) async fn list_tenants() -> Result, TenantMapListError> +{ let tenants = TENANTS.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), @@ -1441,12 +1543,10 @@ pub(crate) async fn list_tenants() -> Result, Tenan }; Ok(m.iter() .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => Some((id, tenant.current_state())), + TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), TenantSlot::Secondary => None, TenantSlot::InProgress(_) => None, }) - // TODO(sharding): 
make callers of this function shard-aware - .map(|(k, v)| (k.tenant_id, v)) .collect()) } @@ -1472,12 +1572,14 @@ pub(crate) async fn attach_tenant( // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 + let shard_identity = location_conf.shard; let attached_tenant = tenant_spawn( conf, tenant_shard_id, &tenant_dir, resources, AttachedTenantConf::try_from(location_conf)?, + shard_identity, None, &TENANTS, SpawnMode::Normal, @@ -1543,9 +1645,10 @@ pub enum TenantSlotUpsertError { MapState(#[from] TenantMapError), } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] enum TenantSlotDropError { /// It is only legal to drop a TenantSlot if its contents are fully shut down + #[error("Tenant was not shut down")] NotShutdown, } @@ -1605,9 +1708,9 @@ impl SlotGuard { } } - /// Take any value that was present in the slot before we acquired ownership + /// Get any value that was present in the slot before we acquired ownership /// of it: in state transitions, this will be the old state. 
- fn get_old_value(&mut self) -> &Option { + fn get_old_value(&self) -> &Option { &self.old_value } @@ -1825,7 +1928,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -1977,21 +2080,19 @@ use { }; pub(crate) async fn immediate_gc( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().unwrap(); - let tenant = guard - .get(&tenant_id) - .map(Arc::clone) - .with_context(|| format!("tenant {tenant_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; - // TODO(sharding): make callers of this function shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let tenant = guard + .get(&tenant_shard_id) + .map(Arc::clone) + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting @@ -2004,9 +2105,9 @@ pub(crate) async fn immediate_gc( task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), + &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), false, async move { fail::fail_point!("immediate_gc_task_pre"); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 
183ee19a40..60b40d70a7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -180,7 +180,7 @@ //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map -mod download; +pub(crate) mod download; pub mod index; mod upload; @@ -188,6 +188,7 @@ use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::download_initdb_tar_zst; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -195,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; +use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use std::ops::DerefMut; @@ -253,6 +256,9 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub(crate) const INITDB_PATH: &str = "initdb.tar.zst"; +/// Default buffer size when interfacing with [`tokio::fs::File`]. +pub(crate) const BUFFER_SIZE: usize = 32 * 1024; + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -312,6 +318,47 @@ pub struct RemoteTimelineClient { storage_impl: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + + cancel: CancellationToken, +} + +/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not +/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. 
+const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); +const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); + +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. +/// +/// This is a convenience for the various upload functions. In future +/// the anyhow::Error result should be replaced with a more structured type that +/// enables callers to avoid handling shutdown as an error. +async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> +where + F: std::future::Future>, +{ + match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(())) => Ok(()), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), + Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), + } +} +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. +async fn download_cancellable( + cancel: &CancellationToken, + future: F, +) -> Result +where + F: std::future::Future>, +{ + match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(r)) => Ok(r), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => { + Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) + } + Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), + } } impl RemoteTimelineClient { @@ -347,6 +394,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + cancel: CancellationToken::new(), } } @@ -497,6 +545,7 @@ impl RemoteTimelineClient { &self, layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, + cancel: &CancellationToken, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -513,6 +562,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -768,8 +818,25 @@ impl 
RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { + // Filter out any layers which were not created by this tenant shard. These are + // layers that originate from some ancestor shard after a split, and may still + // be referenced by other shards. We are free to delete them locally and remove + // them from our index (and would have already done so when we reach this point + // in the code), but we may not delete them remotely. + with_metadata.retain(|(name, meta)| { + let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count; + if !retain { + tracing::debug!( + "Skipping deletion of ancestor-shard layer {name}, from shard {}", + meta.shard + ); + } + retain + }); + for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", @@ -967,6 +1034,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, &index_part_with_deleted_at, + &self.cancel, ) }, |_e| false, @@ -976,8 +1044,7 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await?; @@ -1077,7 +1144,17 @@ impl RemoteTimelineClient { let remaining_layers: Vec = remaining .into_iter() - .filter(|p| p!= &latest_index) + .filter(|p| { + if p == &latest_index { + return false; + } + if let Some(name) = p.object_name() { + if name == INITDB_PATH { + return false; + } + } + true + }) .inspect(|path| { if let Some(name) = path.object_name() { info!(%name, "deleting a file not referenced 
from index_part.json"); @@ -1209,7 +1286,7 @@ impl RemoteTimelineClient { task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", false, @@ -1267,6 +1344,7 @@ impl RemoteTimelineClient { path, layer_metadata, self.generation, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1293,6 +1371,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, index_part, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1590,6 +1669,23 @@ impl RemoteTimelineClient { } } } + + pub(crate) fn get_layers_metadata( + &self, + layers: Vec, + ) -> anyhow::Result>> { + let q = self.upload_queue.lock().unwrap(); + let q = match &*q { + UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { + anyhow::bail!("queue is in state {}", q.as_str()) + } + UploadQueue::Initialized(inner) => inner, + }; + + let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); + + Ok(decorated.collect()) + } } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -1645,6 +1741,13 @@ pub fn remote_index_path( .expect("Failed to construct path") } +pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; + +pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { + RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) + .expect("Failed to construct path") +} + /// Given the key of an index, parse out the generation part of the name pub fn parse_remote_index_path(path: RemotePath) -> Option { let file_name = match path.get_path().file_name() { @@ -1790,6 +1893,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + cancel: CancellationToken::new(), }) } @@ -2105,15 +2209,6 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); - let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let 
remote_timeline_dir = test_state.harness.remote_fs_dir.join( - timeline_path - .strip_prefix(&test_state.harness.conf.workdir) - .unwrap(), - ); - - std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( &test_state.harness.tenant_shard_id, @@ -2122,6 +2217,10 @@ mod tests { ) .get_path(), ); + + std::fs::create_dir_all(index_path.parent().unwrap()) + .expect("creating test dir should work"); + eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); example_index_part diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 1e9dcfe76a..d3956163c8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,33 +5,36 @@ use std::collections::HashSet; use std::future::Future; -use std::time::Duration; use anyhow::{anyhow, Context}; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; -use tokio::fs; -use tokio::io::AsyncWriteExt; +use tokio::fs::{self, File, OpenOptions}; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::sync::CancellationToken; +use tracing::warn; +use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::remote_timeline_client::{ + download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, +}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; +use crate::virtual_file::on_fatal_io_error; +use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use 
utils::crashsafe::path_with_suffix_extension; use utils::id::TimelineId; use super::index::{IndexPart, LayerFileMetadata}; use super::{ - parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, + parse_remote_index_path, remote_index_path, remote_initdb_archive_path, + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; -static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); - /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -44,6 +47,7 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, + cancel: &CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -71,15 +75,18 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let cancel_inner = cancel.clone(); let (mut destination_file, bytes_amount) = download_retry( || async { - // TODO: this doesn't use the cached fd for some reason? 
- let mut destination_file = fs::File::create(&temp_file_path) + let destination_file = tokio::fs::File::create(&temp_file_path) .await .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - let mut download = storage - .download(&remote_path) + + // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local + // file: the write to local file doesn't start until after the request header is returned + // and we start draining the body stream below + let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) .await .with_context(|| { format!( @@ -88,12 +95,38 @@ pub async fn download_layer_file<'a>( }) .map_err(DownloadError::Other)?; - let bytes_amount = tokio::time::timeout( - MAX_DOWNLOAD_DURATION, - tokio::io::copy(&mut download.download_stream, &mut destination_file), + let mut destination_file = + tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); + + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + + // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, + // and we will unlink the temporary file if there is an error. This unlink is important because we + // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that + // we will imminently try and write to again. + let bytes_amount: u64 = match timeout_cancellable( + DOWNLOAD_TIMEOUT, + &cancel_inner, + tokio::io::copy_buf(&mut reader, &mut destination_file), ) .await - .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!( + "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" + ) + }) + .map_err(DownloadError::Other)? + { + Ok(b) => Ok(b), + Err(e) => { + // Remove incomplete files: on restart Timeline would do this anyway, but we must + // do it here for the retry case.
+ if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { + on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); + } + Err(e) + } + } .with_context(|| { format!( "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" @@ -101,9 +134,12 @@ pub async fn download_layer_file<'a>( }) .map_err(DownloadError::Other)?; + let destination_file = destination_file.into_inner(); + Ok((destination_file, bytes_amount)) }, &format!("download {remote_path:?}"), + cancel, ) .await?; @@ -180,8 +216,14 @@ pub async fn list_remote_timelines( anyhow::bail!("storage-sync-list-remote-timelines"); }); + let cancel_inner = cancel.clone(); let listing = download_retry_forever( - || storage.list(Some(&remote_path), ListingMode::WithDelimiter), + || { + download_cancellable( + &cancel_inner, + storage.list(Some(&remote_path), ListingMode::WithDelimiter), + ) + }, &format!("list timelines for {tenant_shard_id}"), cancel, ) @@ -218,20 +260,26 @@ async fn do_download_index_part( index_generation: Generation, cancel: CancellationToken, ) -> Result { + use futures::stream::StreamExt; + let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - let mut index_part_download = storage.download(&remote_path).await?; + // Cancellation: it is safe to cancel this future because we're just downloading into + // a memory buffer, not touching local disk.
+ let index_part_download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; let mut index_part_bytes = Vec::new(); - tokio::io::copy( - &mut index_part_download.download_stream, - &mut index_part_bytes, - ) - .await - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; + let mut stream = std::pin::pin!(index_part_download.download_stream); + while let Some(chunk) = stream.next().await { + let chunk = chunk + .with_context(|| format!("download index part at {remote_path:?}")) + .map_err(DownloadError::Other)?; + index_part_bytes.extend_from_slice(&chunk[..]); + } Ok(index_part_bytes) }, &format!("download {remote_path:?}"), @@ -337,10 +385,7 @@ pub(super) async fn download_index_part( FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "listing index_part files", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .map_err(DownloadError::Other)?; @@ -361,7 +406,7 @@ pub(super) async fn download_index_part( None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. 
- tracing::info!("No index_part.json* found"); + tracing::debug!("No index_part.json* found"); do_download_index_part( storage, tenant_shard_id, @@ -374,6 +419,83 @@ pub(super) async fn download_index_part( } } +pub(crate) async fn download_initdb_tar_zst( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + cancel: &CancellationToken, +) -> Result<(Utf8PathBuf, File), DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id); + + let timeline_path = conf.timelines_path(tenant_shard_id); + + if !timeline_path.exists() { + tokio::fs::create_dir_all(&timeline_path) + .await + .with_context(|| format!("timeline dir creation {timeline_path}")) + .map_err(DownloadError::Other)?; + } + let temp_path = timeline_path.join(format!( + "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" + )); + + let cancel_inner = cancel.clone(); + + let file = download_retry( + || async { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&temp_path) + .await + .with_context(|| format!("tempfile creation {temp_path}")) + .map_err(DownloadError::Other)?; + + let download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let mut download = tokio_util::io::StreamReader::new(download.download_stream); + let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + + // TODO: this consumption of the response body should be subject to timeout + cancellation, but + // not without thinking carefully about how to recover safely from cancelling a write to + // local storage (e.g. 
by writing into a temp file as we do in download_layer) + tokio::io::copy_buf(&mut download, &mut writer) + .await + .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) + .map_err(DownloadError::Other)?; + + let mut file = writer.into_inner(); + + file.seek(std::io::SeekFrom::Start(0)) + .await + .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}")) + .map_err(DownloadError::Other)?; + + Ok(file) + }, + &format!("download {remote_path}"), + cancel, + ) + .await + .map_err(|e| { + // Do a best-effort attempt at deleting the temporary file upon encountering an error. + // We don't have async here nor do we want to pile on any extra errors. + if let Err(e) = std::fs::remove_file(&temp_path) { + if e.kind() != std::io::ErrorKind::NotFound { + warn!("error deleting temporary file {temp_path}: {e}"); + } + } + e + })?; + + Ok((temp_path, file)) +} + /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network @@ -381,7 +503,11 @@ pub(super) async fn download_index_part( /// with backoff. 
/// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(op: O, description: &str) -> Result +async fn download_retry( + op: O, + description: &str, + cancel: &CancellationToken, +) -> Result where O: FnMut() -> F, F: Future>, @@ -392,10 +518,7 @@ where FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> DownloadError { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), ) .await } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 4ca4438003..11c6956875 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,18 +1,20 @@ //! Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; -use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::ErrorKind; -use tokio::fs; +use std::io::{ErrorKind, SeekFrom}; +use tokio::fs::{self, File}; +use tokio::io::AsyncSeekExt; +use tokio_util::sync::CancellationToken; use super::Generation; use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path, + upload_cancellable, }, }; use remote_storage::GenericRemoteStorage; @@ -29,6 +31,7 @@ pub(super) async fn upload_index_part<'a>( timeline_id: &TimelineId, generation: Generation, index_part: &'a IndexPart, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -41,13 +44,19 @@ pub(super) async fn upload_index_part<'a>( .to_s3_bytes() .context("serialize index part file into bytes")?; let index_part_size = index_part_bytes.len(); - let index_part_bytes = 
tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); + let index_part_bytes = bytes::Bytes::from(index_part_bytes); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - storage - .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + upload_cancellable( + cancel, + storage.upload_storage_object( + futures::stream::once(futures::future::ready(Ok(index_part_bytes))), + index_part_size, + &remote_path, + ), + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. @@ -60,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>( source_path: &'a Utf8Path, known_metadata: &'a LayerFileMetadata, generation: Generation, + cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") @@ -101,8 +111,9 @@ pub(super) async fn upload_timeline_layer<'a>( let fs_size = usize::try_from(fs_size) .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; - storage - .upload(source_file, fs_size, &storage_path, None) + let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); + + upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) .await .with_context(|| format!("upload layer from local path '{source_path}'"))?; @@ -114,16 +125,22 @@ pub(crate) async fn upload_initdb_dir( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, - initdb_dir: Bytes, + mut initdb_tar_zst: File, + size: u64, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading initdb dir"); - let size = initdb_dir.len(); - let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir)); + // We might have read somewhat into the file already in the prior retry 
attempt + initdb_tar_zst.seek(SeekFrom::Start(0)).await?; + + let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - storage - .upload_storage_object(bytes, size, &remote_path) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + upload_cancellable( + cancel, + storage.upload_storage_object(file, size as usize, &remote_path), + ) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs new file mode 100644 index 0000000000..d25fe56b92 --- /dev/null +++ b/pageserver/src/tenant/secondary.rs @@ -0,0 +1,104 @@ +pub mod heatmap; +mod heatmap_uploader; + +use std::sync::Arc; + +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; + +use self::heatmap_uploader::heatmap_uploader_task; + +use super::mgr::TenantManager; + +use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; + +use tokio_util::sync::CancellationToken; +use utils::completion::Barrier; + +enum UploadCommand { + Upload(TenantShardId), +} + +struct CommandRequest { + payload: T, + response_tx: tokio::sync::oneshot::Sender, +} + +struct CommandResponse { + result: anyhow::Result<()>, +} + +/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, +/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, +/// where we want to immediately upload/download for a particular tenant. In normal operation +/// uploads & downloads are autonomous and not driven by this interface. 
+pub struct SecondaryController { + upload_req_tx: tokio::sync::mpsc::Sender>, +} + +impl SecondaryController { + async fn dispatch( + &self, + queue: &tokio::sync::mpsc::Sender>, + payload: T, + ) -> anyhow::Result<()> { + let (response_tx, response_rx) = tokio::sync::oneshot::channel(); + + queue + .send(CommandRequest { + payload, + response_tx, + }) + .await + .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + + let response = response_rx + .await + .map_err(|_| anyhow::anyhow!("Request dropped"))?; + + response.result + } + + pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) + .await + } +} + +pub fn spawn_tasks( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) -> SecondaryController { + let (upload_req_tx, upload_req_rx) = + tokio::sync::mpsc::channel::>(16); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryUploads, + None, + None, + "heatmap uploads", + false, + async move { + heatmap_uploader_task( + tenant_manager, + remote_storage, + upload_req_rx, + background_jobs_can_start, + cancel, + ) + .await + }, + ); + + SecondaryController { upload_req_tx } +} + +/// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
+pub fn null_controller() -> SecondaryController { + let (upload_req_tx, _upload_req_rx) = + tokio::sync::mpsc::channel::>(16); + SecondaryController { upload_req_tx } +} diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs new file mode 100644 index 0000000000..99aaaeb8c8 --- /dev/null +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -0,0 +1,64 @@ +use std::time::SystemTime; + +use crate::tenant::{ + remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, +}; + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; + +use utils::{generation::Generation, id::TimelineId}; + +#[derive(Serialize, Deserialize)] +pub(super) struct HeatMapTenant { + /// Generation of the attached location that uploaded the heatmap: this is not required + /// for correctness, but acts as a hint to secondary locations in order to detect thrashing + /// in the unlikely event that two attached locations are both uploading conflicting heatmaps. + pub(super) generation: Generation, + + pub(super) timelines: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub(crate) struct HeatMapTimeline { + #[serde_as(as = "DisplayFromStr")] + pub(super) timeline_id: TimelineId, + + pub(super) layers: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub(crate) struct HeatMapLayer { + pub(super) name: LayerFileName, + pub(super) metadata: IndexLayerMetadata, + + #[serde_as(as = "TimestampSeconds")] + pub(super) access_time: SystemTime, + // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
+} + +impl HeatMapLayer { + pub(crate) fn new( + name: LayerFileName, + metadata: IndexLayerMetadata, + access_time: SystemTime, + ) -> Self { + Self { + name, + metadata, + access_time, + } + } +} + +impl HeatMapTimeline { + pub(crate) fn new(timeline_id: TimelineId, layers: Vec) -> Self { + Self { + timeline_id, + layers, + } + } +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs new file mode 100644 index 0000000000..ece2b93ce1 --- /dev/null +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -0,0 +1,582 @@ +use std::{ + collections::HashMap, + sync::{Arc, Weak}, + time::{Duration, Instant}, +}; + +use crate::{ + metrics::SECONDARY_MODE, + tenant::{ + config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, + secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + }, +}; + +use md5; +use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; + +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::instrument; +use utils::{backoff, completion::Barrier}; + +use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; + +/// Period between heatmap uploader walking Tenants to look for work to do. +/// If any tenants have a heatmap upload period lower than this, it will be adjusted +/// downward to match. +const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); + +struct WriteInProgress { + barrier: Barrier, +} + +struct UploadPending { + tenant: Arc, + last_digest: Option, +} + +struct WriteComplete { + tenant_shard_id: TenantShardId, + completed_at: Instant, + digest: Option, + next_upload: Option, +} + +/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember +/// when we last did a write. 
We only populate this after doing at least one +/// write for a tenant -- this avoids holding state for tenants that have +/// uploads disabled. + +struct UploaderTenantState { + // This Weak only exists to enable culling idle instances of this type + // when the Tenant has been deallocated. + tenant: Weak, + + /// Digest of the serialized heatmap that we last successfully uploaded + /// + /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, + /// which is also an md5sum. + last_digest: Option, + + /// When the last upload attempt completed (may have been successful or failed) + last_upload: Option, + + /// When should we next do an upload? None means never. + next_upload: Option, +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. +struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, + + /// Tenants with work to do, for which tasks should be spawned as soon as concurrency + /// limits permit it. + tenants_pending: std::collections::VecDeque, + + /// Tenants for which a task in `tasks` has been spawned. + tenants_uploading: HashMap, + + tasks: JoinSet<()>, + + /// Channel for our child tasks to send results to: we use a channel for results rather than + /// just getting task results via JoinSet because we need the channel's recv() "sleep until something + /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" + /// behavior. 
+ task_result_tx: tokio::sync::mpsc::UnboundedSender, + task_result_rx: tokio::sync::mpsc::UnboundedReceiver, + + concurrent_uploads: usize, + + scheduling_interval: Duration, +} + +/// The uploader task runs a loop that periodically wakes up and schedules tasks for +/// tenants that require an upload, or handles any commands that have been sent into +/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we +/// spawn. +/// +/// Scheduling iterations are somewhat infrequent. However, each one will enqueue +/// all tenants that require an upload, and in between scheduling iterations we will +/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. +/// +/// While we take a CancellationToken here, it is subordinate to the CancellationTokens +/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise +/// we might block waiting on a Tenant. +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) -> anyhow::Result<()> { + let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; + + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + + let mut uploader = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tasks: JoinSet::new(), + tenants: HashMap::new(), + tenants_pending: std::collections::VecDeque::new(), + tenants_uploading: HashMap::new(), + task_result_tx: result_tx, + task_result_rx: result_rx, + concurrent_uploads, + scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, + }; + + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive 
because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require an upload. + uploader.schedule_iteration().await?; + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(uploader.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s), running immediately!", + uploader.scheduling_interval.as_secs_f64() + ); + Instant::now() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. + tracing::info!("Heatmap uploader joining tasks"); + while let Some(_r) = uploader.tasks.join_next().await {}; + tracing::info!("Heatmap uploader terminating"); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("heatmap_uploader_task: woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("Heatmap uploader terminating"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + uploader.handle_command(payload, response_tx); + }, + _ = uploader.process_next_completion() => { + if !cancel.is_cancelled() { + uploader.spawn_pending(); + } + } + } + } + } + + Ok(()) +} + +impl HeatmapUploader { + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. 
+ async fn schedule_iteration(&mut self) -> anyhow::Result<()> { + // Cull any entries in self.tenants whose Tenant has been dropped, or that have no next upload scheduled + self.tenants + .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.tenants_pending.clear(); + + // Use a fixed 'now' through the following loop, for efficiency and fairness. + let now = Instant::now(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // (once every YIELD_ITERATIONS tenants) to avoid blocking the executor. + const YIELD_ITERATIONS: usize = 1000; + + // Iterate over tenants looking for work to do. + let tenants = self.tenant_manager.get_attached_active_tenant_shards(); + for (i, tenant) in tenants.into_iter().enumerate() { + // Process is shutting down, drop out + if self.cancel.is_cancelled() { + return Ok(()); + } + + // Skip tenants that already have a write in flight + if self + .tenants_uploading + .contains_key(tenant.get_tenant_shard_id()) + { + continue; + } + + self.maybe_schedule_upload(&now, tenant); + + if (i + 1) % YIELD_ITERATIONS == 0 { + tokio::task::yield_now().await; + } + } + + // Spawn tasks for as many of our pending tenants as we can. + self.spawn_pending(); + + Ok(()) + } + + /// Wait for the next result sent by a spawned upload task and apply it via `on_completion`. + /// Cancellation: this method is cancel-safe. + async fn process_next_completion(&mut self) { + match self.task_result_rx.recv().await { + Some(r) => { + self.on_completion(r); + } + None => { + unreachable!("Result sender is stored on Self"); + } + } + } + + /// The 'maybe' refers to the tenant's state: whether it is configured + /// for heatmap uploads at all, and whether sufficient time has passed + /// since the last upload.
+ fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { + match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. + if period < self.scheduling_interval { + self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); + } + } + } + + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { + return; + } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(Instant::now()), + last_digest: None, + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { + return; + } + + let last_digest = state.last_digest; + self.tenants_pending.push_back(UploadPending { + tenant, + last_digest, + }) + } + + fn spawn_pending(&mut self) { + while !self.tenants_pending.is_empty() + && self.tenants_uploading.len() < self.concurrent_uploads + { + // unwrap: loop condition includes !is_empty() + let pending = self.tenants_pending.pop_front().unwrap(); + self.spawn_upload(pending.tenant, pending.last_digest); + } + } + + fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { + let remote_storage = self.remote_storage.clone(); + let tenant_shard_id = *tenant.get_tenant_shard_id(); + let (completion, 
barrier) = utils::completion::channel(); + let result_tx = self.task_result_tx.clone(); + self.tasks.spawn(async move { + // Guard for the barrier in [`WriteInProgress`] + let _completion = completion; + + let started_at = Instant::now(); + let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { + Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let duration = Instant::now().duration_since(started_at); + SECONDARY_MODE + .upload_heatmap_duration + .observe(duration.as_secs_f64()); + SECONDARY_MODE.upload_heatmap.inc(); + Some(digest) + } + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest, + Err(UploadHeatmapError::Upload(e)) => { + tracing::warn!( + "Failed to upload heatmap for tenant {}: {e:#}", + tenant.get_tenant_shard_id(), + ); + let duration = Instant::now().duration_since(started_at); + SECONDARY_MODE + .upload_heatmap_duration + .observe(duration.as_secs_f64()); + SECONDARY_MODE.upload_heatmap_errors.inc(); + last_digest + } + Err(UploadHeatmapError::Cancelled) => { + tracing::info!("Cancelled heatmap upload, shutting down"); + last_digest + } + }; + + let now = Instant::now(); + let next_upload = tenant + .get_heatmap_period() + .and_then(|period| now.checked_add(period)); + + result_tx + .send(WriteComplete { + tenant_shard_id: *tenant.get_tenant_shard_id(), + completed_at: now, + digest, + next_upload, + }) + .ok(); + }); + + self.tenants_uploading + .insert(tenant_shard_id, WriteInProgress { barrier }); + } + + #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] + fn on_completion(&mut self, completion: WriteComplete) { + tracing::debug!("Heatmap upload completed"); + let WriteComplete { + tenant_shard_id, + completed_at, + digest, + next_upload, + } = completion; + self.tenants_uploading.remove(&tenant_shard_id); + use std::collections::hash_map::Entry; + match self.tenants.entry(tenant_shard_id) { + 
Entry::Vacant(_) => { + // Tenant state was dropped, nothing to update. + } + Entry::Occupied(mut entry) => { + entry.get_mut().last_upload = Some(completed_at); + entry.get_mut().last_digest = digest; + entry.get_mut().next_upload = next_upload + } + } + } + + fn handle_command( + &mut self, + command: UploadCommand, + response_tx: tokio::sync::oneshot::Sender, + ) { + match command { + UploadCommand::Upload(tenant_shard_id) => { + // If an upload was ongoing for this tenant, let it finish first. + let barrier = if let Some(writing_state) = + self.tenants_uploading.get(&tenant_shard_id) + { + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Waiting for heatmap write to complete"); + writing_state.barrier.clone() + } else { + // Spawn the upload then immediately wait for it. This will block processing of other commands and + // starting of other background work. + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = match self + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, true) + { + Ok(t) => t, + Err(e) => { + // Drop result of send: we don't care if caller dropped their receiver + drop(response_tx.send(CommandResponse { + result: Err(e.into()), + })); + return; + } + }; + self.spawn_upload(tenant, None); + let writing_state = self + .tenants_uploading + .get(&tenant_shard_id) + .expect("We just inserted this"); + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Waiting for heatmap upload to complete"); + + writing_state.barrier.clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. 
+ tokio::task::spawn(async move { + barrier.wait().await; + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Heatmap upload complete"); + + // Drop result of send: we don't care if caller dropped their receiver + drop(response_tx.send(CommandResponse { result: Ok(()) })) + }); + } + } + } +} + +enum UploadHeatmapOutcome { + /// We successfully wrote to remote storage, with this digest. + Uploaded(md5::Digest), + /// We did not upload because the heatmap digest was unchanged since the last upload + NoChange, + /// We skipped the upload for some reason, such as tenant/timeline not ready + Skipped, +} + +#[derive(thiserror::Error, Debug)] +enum UploadHeatmapError { + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Upload(#[from] anyhow::Error), +} + +/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest +/// of the object we would have uploaded. +#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] +async fn upload_tenant_heatmap( + remote_storage: GenericRemoteStorage, + tenant: &Arc, + last_digest: Option, +) -> Result { + debug_assert_current_span_has_tenant_id(); + + let generation = tenant.get_generation(); + if generation.is_none() { + // We do not expect this: generations were implemented before heatmap uploads. 
However, + // handle it so that we don't have to make the generation in the heatmap an Option<> + // (Generation::none is not serializable) + tracing::warn!("Skipping heatmap upload for tenant with generation==None"); + return Ok(UploadHeatmapOutcome::Skipped); + } + + let mut heatmap = HeatMapTenant { + timelines: Vec::new(), + generation, + }; + let timelines = tenant.timelines.lock().unwrap().clone(); + + let tenant_cancel = tenant.cancel.clone(); + + // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise + // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind + // in remote storage. + let _guard = match tenant.gate.enter() { + Ok(g) => g, + Err(_) => { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); + } + }; + + for (timeline_id, timeline) in timelines { + let heatmap_timeline = timeline.generate_heatmap().await; + match heatmap_timeline { + None => { + tracing::debug!( + "Skipping heatmap upload because timeline {timeline_id} is not ready" + ); + return Ok(UploadHeatmapOutcome::Skipped); + } + Some(heatmap_timeline) => { + heatmap.timelines.push(heatmap_timeline); + } + } + } + + // Serialize the heatmap + let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; + let size = bytes.len(); + + // Drop out early if nothing changed since our last upload + let digest = md5::compute(&bytes); + if Some(digest) == last_digest { + return Ok(UploadHeatmapOutcome::NoChange); + } + + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); + + // Write the heatmap. 
+ tracing::debug!("Uploading {size} byte heatmap to {path}"); + if let Err(e) = backoff::retry( + || async { + let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from( + bytes.clone(), + )))); + remote_storage + .upload_storage_object(bytes, size, &path) + .await + }, + |_| false, + 3, + u32::MAX, + "Uploading heatmap", + backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")), + ) + .await + { + if tenant_cancel.is_cancelled() { + return Err(UploadHeatmapError::Cancelled); + } else { + return Err(e.into()); + } + } + + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); + + Ok(UploadHeatmapOutcome::Uploaded(digest)) +} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 944e05883f..6e9a4932d8 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -4,7 +4,7 @@ pub mod delta_layer; mod filename; pub mod image_layer; mod inmemory_layer; -mod layer; +pub(crate) mod layer; mod layer_desc; use crate::context::{AccessStatsBehavior, RequestContext}; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 003cf0e92b..7c9103eea8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use tokio::sync::RwLock; +use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{DeltaLayerWriter, ResidentLayer}; @@ -246,16 +246,43 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree - pub async fn put_value( + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, val: &Value, ctx: &RequestContext, ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let inner: &mut _ = &mut *self.inner.write().await; + let mut inner = self.inner.write().await; self.assert_writable(); + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) + } + + async fn put_value_locked( + &self, + locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { // Avoid doing allocations for "small" values. @@ -264,7 +291,7 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner + locked_inner .file .write_blob( &buf, @@ -275,7 +302,7 @@ impl InMemoryLayer { .await? }; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = locked_inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -285,13 +312,11 @@ impl InMemoryLayer { Ok(()) } - pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys - Ok(()) } - /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3ed4e05bea..8ae911b31e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -222,14 +222,18 @@ impl Layer { /// /// [gc]: [`RemoteTimelineClient::schedule_gc_update`] /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`] - pub(crate) fn garbage_collect_on_drop(&self) { - self.0.garbage_collect_on_drop(); + pub(crate) fn delete_on_drop(&self) { + self.0.delete_on_drop(); } /// Return data needed to reconstruct given page at LSN. /// /// It is up to the caller to collect more data from the previous layer and /// perform WAL redo, if necessary. + /// + /// # Cancellation-Safety + /// + /// This method is cancellation-safe. pub(crate) async fn get_value_reconstruct_data( &self, key: Key, @@ -255,8 +259,9 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) + .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) .await + .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } /// Download the layer if evicted. @@ -327,10 +332,10 @@ impl Layer { Ok(()) } - /// Waits until this layer has been dropped (and if needed, local garbage collection and remote + /// Waits until this layer has been dropped (and if needed, local file deletion and remote /// deletion scheduling has completed). /// - /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that + /// Does not start local deletion, use [`Self::delete_on_drop`] for that /// separatedly. 
#[cfg(feature = "testing")] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { @@ -419,8 +424,8 @@ struct LayerInner { /// Initialization and deinitialization are done while holding a permit. inner: heavier_once_cell::OnceCell, - /// Do we want to garbage collect this when `LayerInner` is dropped - wanted_garbage_collected: AtomicBool, + /// Do we want to delete locally and remotely this when `LayerInner` is dropped + wanted_deleted: AtomicBool, /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger @@ -434,10 +439,6 @@ struct LayerInner { version: AtomicUsize, /// Allow subscribing to when the layer actually gets evicted. - /// - /// If in future we need to implement "wait until layer instances are gone and done", carrying - /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a - /// method for "wait_gc" which will wait to this being closed. status: tokio::sync::broadcast::Sender, /// Counter for exponential backoff with the download @@ -457,6 +458,8 @@ struct LayerInner { /// For loaded layers, this may be some other value if the tenant has undergone /// a shard split since the layer was originally written. shard: ShardIndex, + + last_evicted_at: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -479,14 +482,14 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { - if !*self.wanted_garbage_collected.get_mut() { + if !*self.wanted_deleted.get_mut() { // should we try to evict if the last wish was for eviction? 
// feels like there's some hazard of overcrowding near shutdown near by, but we don't // run drops during shutdown (yet) return; } - let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); + let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); let file_name = self.layer_desc().filename(); @@ -513,8 +516,8 @@ impl Drop for LayerInner { false } Err(e) => { - tracing::error!("failed to remove garbage collected layer: {e}"); - LAYER_IMPL_METRICS.inc_gc_removes_failed(); + tracing::error!("failed to remove wanted deleted layer: {e}"); + LAYER_IMPL_METRICS.inc_delete_removes_failed(); false } }; @@ -536,15 +539,15 @@ impl Drop for LayerInner { } else { tracing::warn!("scheduling deletion on drop failed: {e:#}"); } - LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed); + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); } else { - LAYER_IMPL_METRICS.inc_completed_gcs(); + LAYER_IMPL_METRICS.inc_completed_deletes(); } } } else { // no need to nag that timeline is gone: under normal situation on // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. 
- LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone); + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); } }); } @@ -579,7 +582,7 @@ impl LayerInner { timeline: Arc::downgrade(timeline), have_remote_client: timeline.remote_client.is_some(), access_stats, - wanted_garbage_collected: AtomicBool::new(false), + wanted_deleted: AtomicBool::new(false), wanted_evicted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), @@ -587,19 +590,17 @@ impl LayerInner { consecutive_failures: AtomicUsize::new(0), generation, shard, + last_evicted_at: std::sync::Mutex::default(), } } - fn garbage_collect_on_drop(&self) { - let res = self.wanted_garbage_collected.compare_exchange( - false, - true, - Ordering::Release, - Ordering::Relaxed, - ); + fn delete_on_drop(&self) { + let res = + self.wanted_deleted + .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed); if res.is_ok() { - LAYER_IMPL_METRICS.inc_started_gcs(); + LAYER_IMPL_METRICS.inc_started_deletes(); } } @@ -654,7 +655,6 @@ impl LayerInner { } /// Cancellation safe. 
- #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,79 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // FIXME: grab a gate - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); + + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. 
+ let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; + + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. 
- self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. 
- LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { @@ -832,7 +854,7 @@ impl LayerInner { crate::task_mgr::spawn( &tokio::runtime::Handle::current(), crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id.tenant_id), + Some(self.desc.tenant_shard_id), Some(self.desc.timeline_id), &task_name, false, @@ -846,6 +868,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), + &crate::task_mgr::shutdown_token() ) .await; @@ -855,6 +878,23 @@ impl LayerInner { Ok(()) } Err(e) => { + let consecutive_failures = + this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, + _ = timeline.cancel.cancelled() => {}, + }; + Err(e) } }; @@ -863,14 +903,13 @@ impl LayerInner { match res { (Ok(()), _) => { // our caller is cancellation safe so this is fine; if someone - // else requests the layer, they'll find it already downloaded - // or redownload. + // else requests the layer, they'll find it already downloaded. 
// - // however, could be that we should consider marking the layer - // for eviction? alas, cannot: because only DownloadedLayer - // will handle that. - tracing::info!("layer file download completed after requester had cancelled"); - LAYER_IMPL_METRICS.inc_download_completed_without_requester(); + // See counter [`LayerImplMetrics::inc_init_needed_no_download`] + // + // FIXME(#6028): however, could be that we should consider marking the + // layer for eviction? alas, cannot: because only DownloadedLayer will + // handle that. }, (Err(e), _) => { // our caller is cancellation safe, but we might be racing with @@ -904,21 +943,9 @@ impl LayerInner { Ok(permit) } Ok((Err(e), _permit)) => { - // FIXME: this should be with the spawned task and be cancellation sensitive - // - // while we should not need this, this backoff has turned out to be useful with - // a bug of unexpectedly deleted remote layer file (#5787). - let consecutive_failures = - self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + // sleep already happened in the spawned task, if it was not cancelled + let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::time::sleep(backoff).await; Err(DownloadError::DownloadFailed) } Err(_gone) => Err(DownloadError::DownloadCancelled), @@ -990,12 +1017,15 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. 
fn on_downloaded_layer_drop(self: Arc, version: usize) { - let gc = self.wanted_garbage_collected.load(Ordering::Acquire); + let delete = self.wanted_deleted.load(Ordering::Acquire); let evict = self.wanted_evicted.load(Ordering::Acquire); let can_evict = self.have_remote_client; - if gc { - // do nothing now, only in LayerInner::drop + if delete { + // do nothing now, only in LayerInner::drop -- this was originally implemented because + // we could had already scheduled the deletion at the time. + // + // FIXME: this is not true anymore, we can safely evict wanted deleted files. } else if can_evict && evict { let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); @@ -1010,7 +1040,7 @@ impl LayerInner { crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { let _g = span.entered(); - // if LayerInner is already dropped here, do nothing because the garbage collection + // if LayerInner is already dropped here, do nothing because the delete on drop // has already ran while we were in queue let Some(this) = this.upgrade() else { LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); @@ -1110,6 +1140,8 @@ impl LayerInner { // we are still holding the permit, so no new spawn_download_and_wait can happen drop(self.status.send(Status::Evicted)); + *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + res } @@ -1401,35 +1433,38 @@ impl From for Layer { } } -use metrics::{IntCounter, IntCounterVec}; +use metrics::IntCounter; -struct LayerImplMetrics { +pub(crate) struct LayerImplMetrics { started_evictions: IntCounter, completed_evictions: IntCounter, - cancelled_evictions: IntCounterVec, + cancelled_evictions: enum_map::EnumMap, - started_gcs: IntCounter, - completed_gcs: IntCounter, - failed_gcs: IntCounterVec, + started_deletes: IntCounter, + completed_deletes: 
IntCounter, + failed_deletes: enum_map::EnumMap, - rare_counters: IntCounterVec, + rare_counters: enum_map::EnumMap, + inits_cancelled: metrics::core::GenericCounter, + redownload_after: metrics::Histogram, } impl Default for LayerImplMetrics { fn default() -> Self { - let evictions = metrics::register_int_counter_vec!( - "pageserver_layer_evictions_count", - "Evictions started and completed in the Layer implementation", - &["state"] + use enum_map::Enum; + + // reminder: these will be pageserver_layer_* with "_total" suffix + + let started_evictions = metrics::register_int_counter!( + "pageserver_layer_started_evictions", + "Evictions started in the Layer implementation" + ) + .unwrap(); + let completed_evictions = metrics::register_int_counter!( + "pageserver_layer_completed_evictions", + "Evictions completed in the Layer implementation" ) .unwrap(); - - let started_evictions = evictions - .get_metric_with_label_values(&["started"]) - .unwrap(); - let completed_evictions = evictions - .get_metric_with_label_values(&["completed"]) - .unwrap(); let cancelled_evictions = metrics::register_int_counter_vec!( "pageserver_layer_cancelled_evictions_count", @@ -1438,24 +1473,36 @@ impl Default for LayerImplMetrics { ) .unwrap(); - // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix - let gcs = metrics::register_int_counter_vec!( - "pageserver_layer_gcs_count", - "Garbage collections started and completed in the Layer implementation", - &["state"] + let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let reason = EvictionCancelled::from_usize(i); + let s = reason.as_str(); + cancelled_evictions.with_label_values(&[s]) + })); + + let started_deletes = metrics::register_int_counter!( + "pageserver_layer_started_deletes", + "Deletions on drop pending in the Layer implementation" + ) + .unwrap(); + let completed_deletes = metrics::register_int_counter!( + "pageserver_layer_completed_deletes", + "Deletions on drop 
completed in the Layer implementation" ) .unwrap(); - let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap(); - let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap(); - - let failed_gcs = metrics::register_int_counter_vec!( - "pageserver_layer_failed_gcs_count", - "Different reasons for garbage collections to have failed", + let failed_deletes = metrics::register_int_counter_vec!( + "pageserver_layer_failed_deletes_count", + "Different reasons for deletions on drop to have failed", &["reason"] ) .unwrap(); + let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let reason = DeleteFailed::from_usize(i); + let s = reason.as_str(); + failed_deletes.with_label_values(&[s]) + })); + let rare_counters = metrics::register_int_counter_vec!( "pageserver_layer_assumed_rare_count", "Times unexpected or assumed rare event happened", @@ -1463,16 +1510,50 @@ impl Default for LayerImplMetrics { ) .unwrap(); + let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let event = RareEvent::from_usize(i); + let s = event.as_str(); + rare_counters.with_label_values(&[s]) + })); + + let inits_cancelled = metrics::register_int_counter!( + "pageserver_layer_inits_cancelled_count", + "Times Layer initialization was cancelled", + ) + .unwrap(); + + let redownload_after = { + let minute = 60.0; + let hour = 60.0 * minute; + metrics::register_histogram!( + "pageserver_layer_redownloaded_after", + "Time between evicting and re-downloading.", + vec![ + 10.0, + 30.0, + minute, + 5.0 * minute, + 15.0 * minute, + 30.0 * minute, + hour, + 12.0 * hour, + ] + ) + .unwrap() + }; + Self { started_evictions, completed_evictions, cancelled_evictions, - started_gcs, - completed_gcs, - failed_gcs, + started_deletes, + completed_deletes, + failed_deletes, rare_counters, + inits_cancelled, + redownload_after, } } } @@ -1485,57 +1566,33 @@ impl LayerImplMetrics { self.completed_evictions.inc(); } fn 
inc_eviction_cancelled(&self, reason: EvictionCancelled) { - self.cancelled_evictions - .get_metric_with_label_values(&[reason.as_str()]) - .unwrap() - .inc() + self.cancelled_evictions[reason].inc() } - fn inc_started_gcs(&self) { - self.started_gcs.inc(); + fn inc_started_deletes(&self) { + self.started_deletes.inc(); } - fn inc_completed_gcs(&self) { - self.completed_gcs.inc(); + fn inc_completed_deletes(&self) { + self.completed_deletes.inc(); } - fn inc_gcs_failed(&self, reason: GcFailed) { - self.failed_gcs - .get_metric_with_label_values(&[reason.as_str()]) - .unwrap() - .inc(); + fn inc_deletes_failed(&self, reason: DeleteFailed) { + self.failed_deletes[reason].inc(); } - /// Counted separatedly from failed gcs because we will complete the gc attempt regardless of - /// failure to delete local file. - fn inc_gc_removes_failed(&self) { - self.rare_counters - .get_metric_with_label_values(&["gc_remove_failed"]) - .unwrap() - .inc(); + /// Counted separatedly from failed layer deletes because we will complete the layer deletion + /// attempt regardless of failure to delete local file. + fn inc_delete_removes_failed(&self) { + self.rare_counters[RareEvent::RemoveOnDropFailed].inc(); } - /// Expected rare because requires a race with `evict_blocking` and - /// `get_or_maybe_download`. + /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`. 
fn inc_retried_get_or_maybe_download(&self) { - self.rare_counters - .get_metric_with_label_values(&["retried_gomd"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc(); } - /// Expected rare because cancellations are unexpected - fn inc_download_completed_without_requester(&self) { - self.rare_counters - .get_metric_with_label_values(&["download_completed_without"]) - .unwrap() - .inc(); - } - - /// Expected rare because cancellations are unexpected + /// Expected rare because cancellations are unexpected, and failures are unexpected fn inc_download_failed_without_requester(&self) { - self.rare_counters - .get_metric_with_label_values(&["download_failed_without"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc(); } /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded. @@ -1543,37 +1600,34 @@ impl LayerImplMetrics { /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an /// Option. fn inc_raced_wanted_evicted_accesses(&self) { - self.rare_counters - .get_metric_with_label_values(&["raced_wanted_evicted"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::UpgradedWantedEvicted].inc(); } - /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when + /// These are only expected for [`Self::inc_init_cancelled`] amount when /// running with remote storage. 
fn inc_init_needed_no_download(&self) { - self.rare_counters - .get_metric_with_label_values(&["init_needed_no_download"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::InitWithoutDownload].inc(); } /// Expected rare because all layer files should be readable and good fn inc_permanent_loading_failures(&self) { - self.rare_counters - .get_metric_with_label_values(&["permanent_loading_failure"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::PermanentLoadingFailure].inc(); } fn inc_broadcast_lagged(&self) { - self.rare_counters - .get_metric_with_label_values(&["broadcast_lagged"]) - .unwrap() - .inc(); + self.rare_counters[RareEvent::EvictAndWaitLagged].inc(); + } + + fn inc_init_cancelled(&self) { + self.inits_cancelled.inc() + } + + fn record_redownloaded_after(&self, duration: std::time::Duration) { + self.redownload_after.observe(duration.as_secs_f64()) } } +#[derive(enum_map::Enum)] enum EvictionCancelled { LayerGone, TimelineGone, @@ -1602,19 +1656,47 @@ impl EvictionCancelled { } } -enum GcFailed { +#[derive(enum_map::Enum)] +enum DeleteFailed { TimelineGone, DeleteSchedulingFailed, } -impl GcFailed { +impl DeleteFailed { fn as_str(&self) -> &'static str { match self { - GcFailed::TimelineGone => "timeline_gone", - GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed", + DeleteFailed::TimelineGone => "timeline_gone", + DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed", } } } -static LAYER_IMPL_METRICS: once_cell::sync::Lazy = +#[derive(enum_map::Enum)] +enum RareEvent { + RemoveOnDropFailed, + RetriedGetOrMaybeDownload, + DownloadFailedWithoutRequester, + UpgradedWantedEvicted, + InitWithoutDownload, + PermanentLoadingFailure, + EvictAndWaitLagged, +} + +impl RareEvent { + fn as_str(&self) -> &'static str { + use RareEvent::*; + + match self { + RemoveOnDropFailed => "remove_on_drop_failed", + RetriedGetOrMaybeDownload => "retried_gomd", + DownloadFailedWithoutRequester => "download_failed_without", + 
UpgradedWantedEvicted => "raced_wanted_evicted", + InitWithoutDownload => "init_needed_no_download", + PermanentLoadingFailure => "permanent_loading_failure", + EvictAndWaitLagged => "broadcast_lagged", + } + } +} + +pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(LayerImplMetrics::default); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 138578ec8a..7ff1873eda 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind { Eviction, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, + InitialLogicalSizeCalculation, } impl BackgroundLoopKind { @@ -53,31 +54,18 @@ impl BackgroundLoopKind { } } -pub(crate) enum RateLimitError { - Cancelled, -} - -pub(crate) async fn concurrent_background_tasks_rate_limit( +/// Cancellation safe. +pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, - cancel: &CancellationToken, -) -> Result { - crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT +) -> impl Drop { + let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) - .inc(); - scopeguard::defer!( - crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); - ); - tokio::select! 
{ - permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { - match permit { - Ok(permit) => Ok(permit), - Err(_closed) => unreachable!("we never close the semaphore"), - } - }, - _ = cancel.cancelled() => { - Err(RateLimitError::Cancelled) - } + .guard(); + + match CONCURRENT_BACKGROUND_TASKS.acquire().await { + Ok(permit) => permit, + Err(_closed) => unreachable!("we never close the semaphore"), } } @@ -86,13 +74,13 @@ pub fn start_background_loops( tenant: &Arc, background_jobs_can_start: Option<&completion::Barrier>, ) { - let tenant_id = tenant.tenant_shard_id.tenant_id; + let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("compactor for tenant {tenant_id}"), + &format!("compactor for tenant {tenant_shard_id}"), false, { let tenant = Arc::clone(tenant); @@ -104,7 +92,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -113,9 +101,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("garbage collector for tenant {tenant_id}"), + &format!("garbage collector for tenant {tenant_shard_id}"), false, { let tenant = Arc::clone(tenant); @@ -127,7 +115,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; gc_loop(tenant, cancel) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs 
b/pageserver/src/tenant/timeline.rs index 9a7c9a6df3..e8340a74b2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2,7 +2,7 @@ pub mod delete; mod eviction_task; mod init; pub mod layer_manager; -mod logical_size; +pub(crate) mod logical_size; pub mod span; pub mod uninit; mod walreceiver; @@ -18,25 +18,29 @@ use pageserver_api::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, TimelineState, }, - shard::TenantShardId, + shard::{ShardIdentity, TenantShardId}, }; +use rand::Rng; use serde_with::serde_as; use storage_broker::BrokerClientChannel; use tokio::{ runtime::Handle, - sync::{oneshot, watch, TryAcquireError}, + sync::{oneshot, watch}, }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{id::TenantTimelineId, sync::gate::Gate}; +use utils::sync::gate::Gate; -use std::cmp::{max, min, Ordering}; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use std::{ + cmp::{max, min, Ordering}, + ops::ControlFlow, +}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, @@ -47,7 +51,7 @@ use crate::tenant::storage_layer::{ LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -62,7 +66,7 @@ use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use 
crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; @@ -73,7 +77,7 @@ use postgres_ffi::to_pg_timestamp; use utils::{ completion, generation::Generation, - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, @@ -94,8 +98,9 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -163,6 +168,10 @@ pub struct Timeline { /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime. pub(crate) generation: Generation, + /// The detailed sharding information from our parent Tenant. This enables us to map keys + /// to shards, and is constant through the lifetime of this Timeline. + shard_identity: ShardIdentity, + pub pg_version: u32, /// The tuple has two elements. @@ -298,13 +307,6 @@ pub struct Timeline { eviction_task_timeline_state: tokio::sync::Mutex, - /// Barrier to wait before doing initial logical size calculation. Used only during startup. - initial_logical_size_can_start: Option, - - /// Completion shared between all timelines loaded during startup; used to delay heavier - /// background tasks until some logical sizes have been calculated. - initial_logical_size_attempt: Mutex>, - /// Load or creation time information about the disk_consistent_lsn and when the loading /// happened. 
Used for consumption metrics. pub(crate) loaded_at: (Lsn, SystemTime), @@ -376,9 +378,6 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), - /// The operation would require downloading a layer that is missing locally. - NeedsDownload(TenantTimelineId, LayerFileName), - /// The operation was cancelled Cancelled, @@ -407,14 +406,6 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped") @@ -428,14 +419,6 @@ impl std::fmt::Display for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped") @@ -453,11 +436,22 @@ pub enum LogicalSizeCalculationCause { TenantSizeHandler, } +pub enum GetLogicalSizePriority { + User, + Background, +} + #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, } +impl std::fmt::Debug for Timeline { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Timeline<{}>", self.timeline_id) + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -472,7 +466,7 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } - /// Lock and get timeline's GC cuttof + /// Lock and get timeline's 
GC cutoff pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() } @@ -489,6 +483,9 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn get( &self, key: Key, @@ -499,6 +496,11 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + // XXX: structured stats collection for layer eviction here. trace!( "get page request for {}@{} from task kind {:?}", @@ -718,19 +720,27 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<(), CompactionError> { - let _g = self.compaction_lock.lock().await; + // most likely the cancellation token is from background task, but in tests it could be the + // request task as well. + + let prepare = async move { + let guard = self.compaction_lock.lock().await; + + let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Compaction, + ctx, + ) + .await; + + (guard, permit) + }; // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = match super::tasks::concurrent_background_tasks_rate_limit( - BackgroundLoopKind::Compaction, - ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return Ok(()), + let (_guard, _permit) = tokio::select! 
{ + tuple = prepare => { tuple }, + _ = self.cancel.cancelled() => return Ok(()), + _ = cancel.cancelled() => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); @@ -801,7 +811,12 @@ impl Timeline { .access_stats_behavior(AccessStatsBehavior::Skip) .build(); - // 2. Create new image layers for partitions that have been modified + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified // "enough". let layers = self .create_image_layers(&partitioning, lsn, false, &image_ctx) @@ -813,11 +828,6 @@ impl Timeline { } } - // 3. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - if let Some(remote_client) = &self.remote_client { // should any new image layer been created, not uploading index_part will // result in a mismatch between remote_physical_size and layermap calculated @@ -849,31 +859,6 @@ impl Timeline { } } - /// Retrieve current logical size of the timeline. - /// - /// The size could be lagging behind the actual number, in case - /// the initial size calculation has not been run (gets triggered on the first size access). 
- /// - /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size( - self: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result<(u64, bool)> { - let current_size = self.current_logical_size.current_size()?; - debug!("Current size: {current_size:?}"); - - let mut is_exact = true; - let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) = - (current_size, self.current_logical_size.initial_part_end) - { - is_exact = false; - self.try_spawn_size_init_task(initial_part_end, ctx); - } - - Ok((size, is_exact)) - } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in /// the in-memory layer, and initiate flushing it if so. /// @@ -923,6 +908,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { + self.spawn_initial_logical_size_computation_task(ctx); self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); self.launch_eviction_task(background_jobs_can_start); @@ -941,7 +927,7 @@ impl Timeline { tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -992,7 +978,7 @@ impl Timeline { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -1010,12 +996,7 @@ impl Timeline { tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks( - None, - Some(self.tenant_shard_id.tenant_id), - Some(self.timeline_id), - ) - .await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; // Finally wait until any gate-holders are complete self.gate.close().await; @@ -1036,17 +1017,6 @@ impl Timeline { 
error!("Not activating a Stopping timeline"); } (_, new_state) => { - if matches!( - new_state, - TimelineState::Stopping | TimelineState::Broken { .. } - ) { - // drop the completion guard, if any; it might be holding off the completion - // forever needlessly - self.initial_logical_size_attempt - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - } self.state.send_replace(new_state); } } @@ -1149,8 +1119,9 @@ impl Timeline { Ok(Some(true)) } - /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer. - /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. + /// Evict just one layer. + /// + /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { let _gate = self .gate @@ -1161,109 +1132,17 @@ impl Timeline { return Ok(None); }; - let Some(local_layer) = local_layer.keep_resident().await? else { - return Ok(Some(false)); - }; - - let local_layer: Layer = local_layer.into(); - - let remote_client = self + let rtc = self .remote_client .as_ref() .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?; - let results = self - .evict_layer_batch(remote_client, &[local_layer]) - .await?; - assert_eq!(results.len(), 1); - let result: Option> = results.into_iter().next().unwrap(); - match result { - None => anyhow::bail!("task_mgr shutdown requested"), - Some(Ok(())) => Ok(Some(true)), - Some(Err(e)) => Err(anyhow::Error::new(e)), + match local_layer.evict_and_wait(rtc).await { + Ok(()) => Ok(Some(true)), + Err(EvictionError::NotFound) => Ok(Some(false)), + Err(EvictionError::Downloaded) => Ok(Some(false)), } } - - /// Evict a batch of layers. 
- pub(crate) async fn evict_layers( - &self, - layers_to_evict: &[Layer], - ) -> anyhow::Result>>> { - let _gate = self - .gate - .enter() - .map_err(|_| anyhow::anyhow!("Shutting down"))?; - - let remote_client = self - .remote_client - .as_ref() - .context("timeline must have RemoteTimelineClient")?; - - self.evict_layer_batch(remote_client, layers_to_evict).await - } - - /// Evict multiple layers at once, continuing through errors. - /// - /// The `remote_client` should be this timeline's `self.remote_client`. - /// We make the caller provide it so that they are responsible for handling the case - /// where someone wants to evict the layer but no remote storage is configured. - /// - /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`. - /// If `Err()` is returned, no eviction was attempted. - /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`. - /// Meaning of each `result[i]`: - /// - `Some(Err(...))` if layer replacement failed for some reason - /// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks) - /// - `Some(Ok(()))` if everything went well. - /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`. 
- async fn evict_layer_batch( - &self, - remote_client: &Arc, - layers_to_evict: &[Layer], - ) -> anyhow::Result>>> { - { - // to avoid racing with detach and delete_timeline - let state = self.current_state(); - anyhow::ensure!( - state == TimelineState::Active, - "timeline is not active but {state:?}" - ); - } - - let mut results = Vec::with_capacity(layers_to_evict.len()); - for _ in 0..layers_to_evict.len() { - results.push(None); - } - - let mut js = tokio::task::JoinSet::new(); - - for (i, l) in layers_to_evict.iter().enumerate() { - js.spawn({ - let l = l.to_owned(); - let remote_client = remote_client.clone(); - async move { (i, l.evict_and_wait(&remote_client).await) } - }); - } - - let join = async { - while let Some(next) = js.join_next().await { - match next { - Ok((i, res)) => results[i] = Some(res), - Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { /* already logged */ } - Err(je) => tracing::error!("unknown JoinError: {je:?}"), - } - } - }; - - tokio::select! { - _ = self.cancel.cancelled() => {}, - _ = join => {} - } - - assert_eq!(results.len(), layers_to_evict.len()); - Ok(results) - } } /// Number of times we will compute partition within a checkpoint distance. 
@@ -1340,16 +1219,20 @@ impl Timeline { &self.conf.default_tenant_conf, ); - // TODO(sharding): make evictions state shard aware - // (https://github.com/neondatabase/neon/issues/5953) let tenant_id_str = self.tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); self.metrics .evictions_with_low_residence_duration .write() .unwrap() - .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + .change_threshold( + &tenant_id_str, + &shard_id_str, + &timeline_id_str, + new_threshold, + ); } } @@ -1365,11 +1248,10 @@ impl Timeline { timeline_id: TimelineId, tenant_shard_id: TenantShardId, generation: Generation, + shard_identity: ShardIdentity, walredo_mgr: Arc, resources: TimelineResources, pg_version: u32, - initial_logical_size_can_start: Option, - initial_logical_size_attempt: Option, state: TimelineState, cancel: CancellationToken, ) -> Arc { @@ -1396,6 +1278,7 @@ impl Timeline { timeline_id, tenant_shard_id, generation, + shard_identity, pg_version, layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), wanted_image_layers: Mutex::new(None), @@ -1421,7 +1304,7 @@ impl Timeline { ancestor_lsn: metadata.ancestor_lsn(), metrics: TimelineMetrics::new( - &tenant_shard_id.tenant_id, + &tenant_shard_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", @@ -1469,8 +1352,6 @@ impl Timeline { ), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), - initial_logical_size_can_start, - initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), cancel, gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), @@ -1524,7 +1405,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", false, 
@@ -1583,6 +1464,7 @@ impl Timeline { max_lsn_wal_lag, auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), + ingest_batch_size: self.conf.ingest_batch_size, }, broker_client, ctx, @@ -1782,38 +1664,92 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { - let state = self.current_state(); - if matches!( - state, - TimelineState::Broken { .. } | TimelineState::Stopping - ) { - // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). - return; + /// Retrieve current logical size of the timeline. + /// + /// The size could be lagging behind the actual number, in case + /// the initial size calculation has not been run (gets triggered on the first size access). + /// + /// Returns the current size; the returned value itself tells whether it is exact or approximate. + pub(crate) fn get_current_logical_size( + self: &Arc, + priority: GetLogicalSizePriority, + ctx: &RequestContext, + ) -> logical_size::CurrentLogicalSize { + let current_size = self.current_logical_size.current_size(); + debug!("Current size: {current_size:?}"); + + match (current_size.accuracy(), priority) { + (logical_size::Accuracy::Exact, _) => (), // nothing to do + (logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => { + // background task will eventually deliver an exact value, we're in no rush + } + (logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => { + // background task is not ready, but user is asking for it now; + // => make the background task skip the line + // (The alternative would be to calculate the size here, but, + // it can actually take a long time if the user has a lot of rels. + // And we'll inevitably need it again; So, let the background task do the work.)
+ match self + .current_logical_size + .cancel_wait_for_background_loop_concurrency_limit_semaphore + .get() + { + Some(cancel) => cancel.cancel(), + None => { + let state = self.current_state(); + if matches!( + state, + TimelineState::Broken { .. } | TimelineState::Stopping + ) { + + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). + // Don't make noise. + } else { + warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + } + } + }; + } } - let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) - .try_acquire_owned() - { - Ok(permit) => permit, - Err(TryAcquireError::NoPermits) => { - // computation already ongoing or finished with success - return; + if let CurrentLogicalSize::Approximate(_) = &current_size { + if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler { + let first = self + .current_logical_size + .did_return_approximate_to_walreceiver + .compare_exchange( + false, + true, + AtomicOrdering::Relaxed, + AtomicOrdering::Relaxed, + ) + .is_ok(); + if first { + crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc(); + } } - Err(TryAcquireError::Closed) => unreachable!("we never call close"), - }; - debug_assert!(self - .current_logical_size - .initial_logical_size - .get() - .is_none()); + } + + current_size + } + + fn spawn_initial_logical_size_computation_task(self: &Arc, ctx: &RequestContext) { + let Some(initial_part_end) = self.current_logical_size.initial_part_end else { + // nothing to do for freshly created timelines; + assert_eq!( + self.current_logical_size.current_size().accuracy(), + logical_size::Accuracy::Exact, + ); + self.current_logical_size.initialized.add_permits(1); + return; + }; + + let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new(); + let token =
cancel_wait_for_background_loop_concurrency_limit_semaphore.clone(); + self.current_logical_size + .cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token) + .expect("initial logical size calculation task must be spawned exactly once per Timeline object"); - info!( - "spawning logical size computation from context of task kind {:?}", - ctx.task_kind() - ); - // We need to start the computation task. - // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); let background_ctx = ctx.detached_child( TaskKind::InitialLogicalSizeCalculation, @@ -1822,95 +1758,163 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", false, // NB: don't log errors here, task_mgr will do that. async move { - let cancel = task_mgr::shutdown_token(); + self_clone + .initial_logical_size_calculation_task( + initial_part_end, + cancel_wait_for_background_loop_concurrency_limit_semaphore, + cancel, + background_ctx, + ) + .await; + Ok(()) + } + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + ); + } - // in case we were created during pageserver initialization, wait for - // initialization to complete before proceeding. startup time init runs on the same - // runtime. - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {} + async fn initial_logical_size_calculation_task( + self: Arc, + initial_part_end: Lsn, + skip_concurrency_limiter: CancellationToken, + cancel: CancellationToken, + background_ctx: RequestContext, + ) { + scopeguard::defer! 
{ + // Irrespective of the outcome of this operation, we should unblock anyone waiting for it. + self.current_logical_size.initialized.add_permits(1); + } + + enum BackgroundCalculationError { + Cancelled, + Other(anyhow::Error), + } + + let try_once = |attempt: usize| { + let background_ctx = &background_ctx; + let self_ref = &self; + let skip_concurrency_limiter = &skip_concurrency_limiter; + async move { + let cancel = task_mgr::shutdown_token(); + let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::InitialLogicalSizeCalculation, + background_ctx, + ); + + use crate::metrics::initial_logical_size::StartCircumstances; + let (_maybe_permit, circumstances) = tokio::select! { + permit = wait_for_permit => { + (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) + } + _ = self_ref.cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + } + _ = cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + }, + () = skip_concurrency_limiter.cancelled() => { + // Some action that is part of an end-user interaction requested logical size + // => break out of the rate limit + // TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime; + // but then again what happens if they cancel; also, we should just be using + // one runtime across the entire process, so, let's leave this for now. + (None, StartCircumstances::SkippedConcurrencyLimiter) + } }; - // hold off background tasks from starting until all timelines get to try at least - // once initial logical size calculation; though retry will rarely be useful. - // holding off is done because heavier tasks execute blockingly on the same - // runtime. - // - // dropping this at every outcome is probably better than trying to cling on to it, - // delay will be terminated by a timeout regardless.
- let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; + let metrics_guard = if attempt == 1 { + crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances) + } else { + crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) + }; - let calculated_size = match self_clone - .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx) + match self_ref + .logical_size_calculation_task( + initial_part_end, + LogicalSizeCalculationCause::Initial, + background_ctx, + ) .await { - Ok(s) => s, + Ok(calculated_size) => Ok((calculated_size, metrics_guard)), Err(CalculateLogicalSizeError::Cancelled) => { - // Don't make noise, this is a common task. - // In the unlikely case that there is another call to this function, we'll retry - // because initial_logical_size is still None. - info!("initial size calculation cancelled, likely timeline delete / tenant detach"); - return Ok(()); + Err(BackgroundCalculationError::Cancelled) } Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(e @ PageReconstructError::AncestorStopping(_)) = + if let Some(PageReconstructError::AncestorStopping(_)) = err.root_cause().downcast_ref() { - // This can happen if the timeline parent timeline switches to - // Stopping state while we're still calculating the initial - // timeline size for the child, for example if the tenant is - // being detached or the pageserver is shut down. Like with - // CalculateLogicalSizeError::Cancelled, don't make noise. 
- info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}"); - return Ok(()); + Err(BackgroundCalculationError::Cancelled) + } else { + Err(BackgroundCalculationError::Other(err)) } - return Err(err.context("Failed to calculate logical size")); - } - }; - - // we cannot query current_logical_size.current_size() to know the current - // *negative* value, only truncated to u64. - let added = self_clone - .current_logical_size - .size_added_after_initial - .load(AtomicOrdering::Relaxed); - - let sum = calculated_size.saturating_add_signed(added); - - // set the gauge value before it can be set in `update_current_logical_size`. - self_clone.metrics.current_logical_size_gauge.set(sum); - - match self_clone - .current_logical_size - .initial_logical_size - .set(calculated_size) - { - Ok(()) => (), - Err(_what_we_just_attempted_to_set) => { - let existing_size = self_clone - .current_logical_size - .initial_logical_size - .get() - .expect("once_cell set was lost, then get failed, impossible."); - // This shouldn't happen because the semaphore is initialized with 1. - // But if it happens, just complain & report success so there are no further retries. 
- error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") } } - // now that `initial_logical_size.is_some()`, reduce permit count to 0 - // so that we prevent future callers from spawning this task - permit.forget(); - Ok(()) - }.in_current_span(), - ); + } + }; + + let retrying = async { + let mut attempt = 0; + loop { + attempt += 1; + + match try_once(attempt).await { + Ok(res) => return ControlFlow::Continue(res), + Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), + Err(BackgroundCalculationError::Other(e)) => { + warn!(attempt, "initial size calculation failed: {e:?}"); + // exponential back-off doesn't make sense at these long intervals; + // use fixed retry interval with generous jitter instead + let sleep_duration = Duration::from_secs( + u64::try_from( + // 1hour base + (60_i64 * 60_i64) + // 10min jitter + + rand::thread_rng().gen_range(-10 * 60..10 * 60), + ) + .expect("10min < 1hour"), + ); + tokio::time::sleep(sleep_duration).await; + } + } + } + }; + + let (calculated_size, metrics_guard) = tokio::select! { + res = retrying => { + match res { + ControlFlow::Continue(calculated_size) => calculated_size, + ControlFlow::Break(()) => return, + } + } + _ = cancel.cancelled() => { + return; + } + }; + + // we cannot query current_logical_size.current_size() to know the current + // *negative* value, only truncated to u64. + let added = self + .current_logical_size + .size_added_after_initial + .load(AtomicOrdering::Relaxed); + + let sum = calculated_size.saturating_add_signed(added); + + // set the gauge value before it can be set in `update_current_logical_size`. 
+ self.metrics.current_logical_size_gauge.set(sum); + + self.current_logical_size + .initial_logical_size + .set((calculated_size, metrics_guard.calculation_result_saved())) + .ok() + .expect("only this task sets it"); } pub fn spawn_ondemand_logical_size_calculation( @@ -1932,7 +1936,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", false, @@ -1948,6 +1952,9 @@ impl Timeline { receiver } + /// # Cancel-Safety + /// + /// This method is cancellation-safe. #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, @@ -1985,6 +1992,10 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, @@ -2049,16 +2060,14 @@ impl Timeline { // one value while current_logical_size is set to the // other. match logical_size.current_size() { - Ok(CurrentLogicalSize::Exact(new_current_size)) => self + CurrentLogicalSize::Exact(ref new_current_size) => self .metrics .current_logical_size_gauge - .set(new_current_size), - Ok(CurrentLogicalSize::Approximate(_)) => { + .set(new_current_size.into()), + CurrentLogicalSize::Approximate(_) => { // don't update the gauge yet, this allows us not to update the gauge back and // forth between the initial size calculation task. } - // this is overflow - Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } @@ -2073,6 +2082,55 @@ impl Timeline { None } + + /// The timeline heatmap is a hint to secondary locations from the primary location, + /// indicating which layers are currently on-disk on the primary. 
+ /// + /// None is returned if the Timeline is in a state where uploading a heatmap + /// doesn't make sense, such as shutting down or initializing. The caller + /// should treat this as a cue to simply skip doing any heatmap uploading + /// for this timeline. + pub(crate) async fn generate_heatmap(&self) -> Option { + let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + + let remote_client = match &self.remote_client { + Some(c) => c, + None => return None, + }; + + let layer_file_names = eviction_info + .resident_layers + .iter() + .map(|l| l.layer.layer_desc().filename()) + .collect::>(); + + let decorated = match remote_client.get_layers_metadata(layer_file_names) { + Ok(d) => d, + Err(_) => { + // Getting metadata only fails on Timeline in bad state. + return None; + } + }; + + let heatmap_layers = std::iter::zip( + eviction_info.resident_layers.into_iter(), + decorated.into_iter(), + ) + .filter_map(|(layer, remote_info)| { + remote_info.map(|remote_info| { + HeatMapLayer::new( + layer.layer.layer_desc().filename(), + IndexLayerMetadata::from(remote_info), + layer.last_activity_ts, + ) + }) + }); + + Some(HeatMapTimeline::new( + self.timeline_id, + heatmap_layers.collect(), + )) + } } type TraversalId = String; @@ -2102,6 +2160,10 @@ impl Timeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. 
async fn get_reconstruct_data( &self, key: Key, @@ -2167,13 +2229,13 @@ impl Timeline { return Err(layer_traversal_error( if cfg!(test) { format!( - "could not find data for key {} at LSN {}, for request at LSN {}\n{}", - key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), ) } else { format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn ) }, traversal_path, @@ -2182,7 +2244,7 @@ impl Timeline { } // Recurse into ancestor if needed - if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { trace!( "going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, @@ -2350,6 +2412,9 @@ impl Timeline { } } + /// # Cancel-safety + /// + /// This method is cancellation-safe. async fn lookup_cached_page( &self, key: &Key, @@ -2361,13 +2426,7 @@ impl Timeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page( - self.tenant_shard_id.tenant_id, - self.timeline_id, - key, - lsn, - ctx, - ) + .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -2384,6 +2443,10 @@ impl Timeline { Ok(Arc::clone(ancestor)) } + pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { + &self.shard_identity + } + /// /// Get a handle to the latest layer for appending. 
/// @@ -2414,13 +2477,31 @@ impl Timeline { Ok(()) } - async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn).await?; - layer.put_tombstone(key_range, lsn).await?; + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. + for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } Ok(()) } - fn finish_write(&self, new_lsn: Lsn) { + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } + Ok(()) + } + + pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); self.metrics.last_record_gauge.set(new_lsn.0 as i64); @@ -2978,6 +3059,15 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + key = key.next(); + continue; + } let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { @@ -3004,6 +3094,7 @@ impl Timeline { } } }; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } @@ -3053,6 +3144,32 @@ impl Timeline { Ok(image_layers) } + + /// Wait until the background initial logical size calculation is complete, or + /// this Timeline is shut down. Calling this function will cause the initial + /// logical size calculation to skip waiting for the background jobs barrier. 
+ pub(crate) async fn await_initial_logical_size(self: Arc) { + if let Some(await_bg_cancel) = self + .current_logical_size + .cancel_wait_for_background_loop_concurrency_limit_semaphore + .get() + { + await_bg_cancel.cancel(); + } else { + // We should not wait if we were not able to explicitly instruct + // the logical size cancellation to skip the concurrency limit semaphore. + // TODO: this is an unexpected case. We should restructure so that it + // can't happen. + tracing::info!( + "await_initial_logical_size: can't get semaphore cancel token, skipping" + ); + } + + tokio::select!( + _ = self.current_logical_size.initialized.acquire() => {}, + _ = self.cancel.cancelled() => {} + ) + } } #[derive(Default)] @@ -3105,7 +3222,7 @@ impl DurationRecorder { #[derive(Default)] struct CompactLevel0Phase1StatsBuilder { version: Option, - tenant_id: Option, + tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, read_lock_held_spawn_blocking_startup_micros: DurationRecorder, @@ -3122,7 +3239,7 @@ struct CompactLevel0Phase1StatsBuilder { #[derive(serde::Serialize)] struct CompactLevel0Phase1Stats { version: u64, - tenant_id: TenantId, + tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, read_lock_held_spawn_blocking_startup_micros: RecordedDuration, @@ -3548,7 +3665,15 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + if !self.shard_identity.is_key_disposable(&key) { + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } if !new_layers.is_empty() { fail_point!("after-timeline-compacted-first-L1"); @@ -3641,7 +3766,7 @@ impl Timeline { let ctx = ctx.attached_child(); let mut stats = CompactLevel0Phase1StatsBuilder { version: Some(2), - tenant_id: Some(self.tenant_shard_id.tenant_id), + tenant_id: 
Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; @@ -3809,7 +3934,14 @@ impl Timeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. pub(super) async fn gc(&self) -> anyhow::Result { - let _g = self.gc_lock.lock().await; + // this is most likely the background tasks, but it might be the spawned task from + // immediate_gc + let cancel = crate::task_mgr::shutdown_token(); + let _g = tokio::select! { + guard = self.gc_lock.lock() => guard, + _ = self.cancel.cancelled() => return Ok(GcResult::default()), + _ = cancel.cancelled() => return Ok(GcResult::default()), + }; let timer = self.metrics.garbage_collect_histo.start_timer(); fail_point!("before-timeline-gc"); @@ -3867,7 +3999,7 @@ impl Timeline { // for details. This will block until the old value is no longer in use. // // The GC cutoff should only ever move forwards. - { + let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); ensure!( *write_guard <= new_gc_cutoff, @@ -3875,8 +4007,9 @@ impl Timeline { *write_guard, new_gc_cutoff ); - write_guard.store_and_unlock(new_gc_cutoff).wait(); - } + write_guard.store_and_unlock(new_gc_cutoff) + }; + waitlist.wait().await; info!("GC starting"); @@ -4102,7 +4235,7 @@ impl Timeline { let cache = page_cache::get(); if let Err(e) = cache .memorize_materialized_page( - self.tenant_shard_id.tenant_id, + self.tenant_shard_id, self.timeline_id, key, last_rec_lsn, @@ -4146,7 +4279,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", false, @@ -4438,8 +4571,16 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value, ctx).await } - pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn).await + 
pub(crate) async fn put_batch( + &self, + batch: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_values(batch, ctx).await + } + + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. @@ -4450,11 +4591,11 @@ impl<'a> TimelineWriter<'a> { /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. - pub fn finish_write(&self, new_lsn: Lsn) { + pub(crate) fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: i64) { + pub(crate) fn update_current_logical_size(&self, delta: i64) { self.tl.update_current_logical_size(delta) } } @@ -4507,7 +4648,7 @@ mod tests { .await .unwrap(); - let rc = timeline + let rtc = timeline .remote_client .clone() .expect("just configured this"); @@ -4520,16 +4661,12 @@ mod tests { .expect("should had been resident") .drop_eviction_guard(); - let batch = [layer]; - - let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() }; - let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() }; + let first = async { layer.evict_and_wait(&rtc).await }; + let second = async { layer.evict_and_wait(&rtc).await }; let (first, second) = tokio::join!(first, second); - let (first, second) = (only_one(first), only_one(second)); - - let res = batch[0].keep_resident().await; + let res = layer.keep_resident().await; assert!(matches!(res, Ok(None)), "{res:?}"); match (first, second) { @@ -4550,14 +4687,6 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - fn only_one(mut input: Vec>) -> T { - assert_eq!(1, input.len()); - input - .pop() - .expect("length just checked") - .expect("no cancellation") - } - async fn find_some_layer(timeline: 
&Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 497796c80a..be873181d9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -21,7 +21,6 @@ use crate::{ }, CreateTimelineCause, DeleteTimelineError, Tenant, }, - InitializationOrder, }; use super::{Timeline, TimelineResources}; @@ -44,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -72,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks( None, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -407,7 +406,6 @@ impl DeleteTimelineFlow { local_metadata: &TimelineMetadata, remote_client: Option, deletion_queue_client: DeletionQueueClient, - init_order: Option<&InitializationOrder>, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. @@ -420,7 +418,6 @@ impl DeleteTimelineFlow { remote_client, deletion_queue_client, }, - init_order, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, @@ -531,7 +528,7 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), Some(timeline_id), "timeline_delete", false, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3fe4bc0f83..ea5f5f5fa7 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,7 @@ use crate::{ task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, - tasks::{BackgroundLoopKind, RateLimitError}, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -60,7 +60,7 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), &format!( "layer eviction for {}/{}", @@ -158,15 +158,15 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + ); + + let _permit = tokio::select! { + permit = acquire_permit => permit, + _ = cancel.cancelled() => return ControlFlow::Break(()), + _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; // If we evict layers but keep cached values derived from those layers, then @@ -212,11 +212,21 @@ impl Timeline { // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. // We don't want to hold the layer map lock during eviction. 
+ // So, we just need to deal with this. - let candidates: Vec<_> = { + + let remote_client = match self.remote_client.as_ref() { + Some(c) => c, + None => { + error!("no remote storage configured, cannot evict layers"); + return ControlFlow::Continue(()); + } + }; + + let mut js = tokio::task::JoinSet::new(); + { let guard = self.layers.read().await; let layers = guard.layer_map(); - let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { let hist_layer = guard.get_from_desc(&hist_layer); @@ -262,54 +272,49 @@ impl Timeline { continue; } }; + let layer = guard.drop_eviction_guard(); if no_activity_for > p.threshold { - candidates.push(guard.drop_eviction_guard()) + let remote_client = remote_client.clone(); + // this could cause a lot of allocations in some cases + js.spawn(async move { layer.evict_and_wait(&remote_client).await }); + stats.candidates += 1; } } - candidates - }; - stats.candidates = candidates.len(); - - let remote_client = match self.remote_client.as_ref() { - None => { - error!( - num_candidates = candidates.len(), - "no remote storage configured, cannot evict layers" - ); - return ControlFlow::Continue(()); - } - Some(c) => c, }; - let results = match self.evict_layer_batch(remote_client, &candidates).await { - Err(pre_err) => { - stats.errors += candidates.len(); - error!("could not do any evictions: {pre_err:#}"); - return ControlFlow::Continue(()); + let join_all = async move { + while let Some(next) = js.join_next().await { + match next { + Ok(Ok(())) => stats.evicted += 1, + Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { + stats.not_evictable += 1; + } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + /* already logged */ + stats.errors += 1; + } + Err(je) => tracing::error!("unknown JoinError: {je:?}"), + } } - Ok(results) => results, + stats }; - assert_eq!(results.len(), candidates.len()); - for result in results { - match result { - None => { - 
stats.skipped_for_shutdown += 1; - } - Some(Ok(())) => { - stats.evicted += 1; - } - Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { - stats.not_evictable += 1; + + tokio::select! { + stats = join_all => { + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { + warn!(stats=?stats, "eviction iteration complete"); + } else { + info!(stats=?stats, "eviction iteration complete"); } } + _ = cancel.cancelled() => { + // just drop the joinset to "abort" + } } - if stats.candidates == stats.not_evictable { - debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { - warn!(stats=?stats, "eviction iteration complete"); - } else { - info!(stats=?stats, "eviction iteration complete"); - } + ControlFlow::Continue(()) } @@ -343,7 +348,7 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) { + let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { Ok(t) => t, Err(_) => { return ControlFlow::Break(()); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index dcd82949dd..e38f5be209 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -243,7 +243,7 @@ impl LayerManager { // map index without actually rebuilding the index. 
updates.remove_historic(desc); mapping.remove(layer); - layer.garbage_collect_on_drop(); + layer.delete_on_drop(); } pub(crate) fn contains(&self, layer: &Layer) -> bool { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index d9c2bc4cb9..03bc59ea38 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ use anyhow::Context; -use once_cell::sync::OnceCell; -use tokio::sync::Semaphore; +use once_cell::sync::OnceCell; +use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; -use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; /// Internal structure to hold all data needed for logical size calculation. /// @@ -23,10 +22,20 @@ pub(super) struct LogicalSize { /// /// NOTE: size at a given LSN is constant, but after a restart we will calculate /// the initial size at a different LSN. - pub initial_logical_size: OnceCell, + pub initial_logical_size: OnceCell<( + u64, + crate::metrics::initial_logical_size::FinishedCalculationGuard, + )>, - /// Semaphore to track ongoing calculation of `initial_logical_size`. - pub initial_size_computation: Arc, + /// Cancellation for the best-effort logical size calculation. + /// + /// The token is kept in a once-cell so that we can error out if a higher priority + /// request comes in *before* we have started the normal logical size calculation. + pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore: + OnceCell, + + /// Once the initial logical size is initialized, this is notified. + pub(crate) initialized: tokio::sync::Semaphore, /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. pub initial_part_end: Option, @@ -52,25 +61,57 @@ pub(super) struct LogicalSize { /// see `current_logical_size_gauge`. 
Use the `update_current_logical_size` /// to modify this, it will also keep the prometheus metric in sync. pub size_added_after_initial: AtomicI64, + + /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`]. + pub(super) did_return_approximate_to_walreceiver: AtomicBool, } /// Normalized current size, that the data in pageserver occupies. #[derive(Debug, Clone, Copy)] -pub(super) enum CurrentLogicalSize { +pub(crate) enum CurrentLogicalSize { /// The size is not yet calculated to the end, this is an intermediate result, /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, /// yet total logical size cannot be below 0. - Approximate(u64), + Approximate(Approximate), // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are // available for observation without any calculations. - Exact(u64), + Exact(Exact), +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub(crate) enum Accuracy { + Approximate, + Exact, +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct Approximate(u64); +#[derive(Debug, Clone, Copy)] +pub(crate) struct Exact(u64); + +impl From<&Approximate> for u64 { + fn from(value: &Approximate) -> Self { + value.0 + } +} + +impl From<&Exact> for u64 { + fn from(val: &Exact) -> Self { + val.0 + } } impl CurrentLogicalSize { - pub(super) fn size(&self) -> u64 { - *match self { - Self::Approximate(size) => size, - Self::Exact(size) => size, + pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { + match self { + Self::Approximate(size) => size.into(), + Self::Exact(size) => size.into(), + } + } + pub(crate) fn accuracy(&self) -> Accuracy { + match self { + Self::Approximate(_) => Accuracy::Approximate, + Self::Exact(_) => Accuracy::Exact, } } } @@ -78,36 +119,44 @@ impl CurrentLogicalSize { impl LogicalSize { pub(super) fn empty_initial() -> Self { Self { - initial_logical_size: 
OnceCell::with_value(0), - // initial_logical_size already computed, so, don't admit any calculations - initial_size_computation: Arc::new(Semaphore::new(0)), + initial_logical_size: OnceCell::with_value((0, { + crate::metrics::initial_logical_size::START_CALCULATION + .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial) + .calculation_result_saved() + })), + cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), + did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } pub(super) fn deferred_initial(compute_to: Lsn) -> Self { Self { initial_logical_size: OnceCell::new(), - initial_size_computation: Arc::new(Semaphore::new(1)), + cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), + did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } - pub(super) fn current_size(&self) -> anyhow::Result { + pub(super) fn current_size(&self) -> CurrentLogicalSize { let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); // ^^^ keep this type explicit so that the casts in this function break if // we change the type. 
match self.initial_logical_size.get() { - Some(initial_size) => { - initial_size.checked_add_signed(size_increment) + Some((initial_size, _)) => { + CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment) .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) - .map(CurrentLogicalSize::Exact) + .unwrap())) } None => { + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); - Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) + CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment)) } } } @@ -121,7 +170,7 @@ impl LogicalSize { /// available for re-use. This doesn't contain the incremental part. pub(super) fn initialized_size(&self, lsn: Lsn) -> Option { match self.initial_part_end { - Some(v) if v == lsn => self.initial_logical_size.get().copied(), + Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s), _ => None, } } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 61130f541a..27d6fd9c28 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -19,14 +19,14 @@ use super::Timeline; pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark)>, + raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark)>, + raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, ) -> Self { Self { owning_tenant, @@ -169,18 +169,55 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { /// /// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. 
#[must_use] -pub(crate) struct TimelineUninitMark { +pub(crate) struct TimelineUninitMark<'t> { + owning_tenant: &'t Tenant, + timeline_id: TimelineId, uninit_mark_deleted: bool, uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } -impl TimelineUninitMark { - pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self { - Self { - uninit_mark_deleted: false, - uninit_mark_path, - timeline_path, +/// Errors when acquiring exclusive access to a timeline ID for creation +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineExclusionError { + #[error("Already exists")] + AlreadyExists(Arc), + #[error("Already creating")] + AlreadyCreating, + + // e.g. I/O errors, or some failure deep in postgres initdb + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl<'t> TimelineUninitMark<'t> { + pub(crate) fn new( + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + uninit_mark_path: Utf8PathBuf, + timeline_path: Utf8PathBuf, + ) -> Result { + // Lock order: this is the only place we take both locks. 
During drop() we only + // lock creating_timelines + let timelines = owning_tenant.timelines.lock().unwrap(); + let mut creating_timelines: std::sync::MutexGuard< + '_, + std::collections::HashSet, + > = owning_tenant.timelines_creating.lock().unwrap(); + + if let Some(existing) = timelines.get(&timeline_id) { + Err(TimelineExclusionError::AlreadyExists(existing.clone())) + } else if creating_timelines.contains(&timeline_id) { + Err(TimelineExclusionError::AlreadyCreating) + } else { + creating_timelines.insert(timeline_id); + Ok(Self { + owning_tenant, + timeline_id, + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + }) } } @@ -207,7 +244,7 @@ impl TimelineUninitMark { } } -impl Drop for TimelineUninitMark { +impl Drop for TimelineUninitMark<'_> { fn drop(&mut self) { if !self.uninit_mark_deleted { if self.timeline_path.exists() { @@ -226,5 +263,11 @@ impl Drop for TimelineUninitMark { } } } + + self.owning_tenant + .timelines_creating + .lock() + .unwrap() + .remove(&self.timeline_id); } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 04ff8602d6..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; +use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::ops::ControlFlow; @@ -41,7 +42,7 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantTimelineId; +use utils::id::TimelineId; use self::connection_manager::ConnectionManagerStatus; @@ -57,10 +58,12 @@ pub struct WalReceiverConf { pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, + pub ingest_batch_size: u64, } pub struct WalReceiver { - timeline: TenantTimelineId, + tenant_shard_id: 
TenantShardId, + timeline_id: TimelineId, manager_status: Arc>>, } @@ -71,7 +74,7 @@ impl WalReceiver { mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { - let tenant_id = timeline.tenant_shard_id.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); @@ -81,9 +84,9 @@ impl WalReceiver { task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, - Some(tenant_id), + Some(timeline.tenant_shard_id), Some(timeline_id), - &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), false, async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -117,11 +120,12 @@ impl WalReceiver { *loop_status.write().unwrap() = None; Ok(()) } - .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) ); Self { - timeline: TenantTimelineId::new(tenant_id, timeline_id), + tenant_shard_id, + timeline_id, manager_status, } } @@ -129,8 +133,8 @@ impl WalReceiver { pub async fn stop(self) { task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.timeline.tenant_id), - Some(self.timeline.timeline_id), + Some(self.tenant_shard_id), + Some(self.timeline_id), ) .await; } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7bfa246eeb..7fa5bb7689 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step( 
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" info!("broker disconnected: {status}"); @@ -411,6 +411,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; + let ingest_batch_size = self.conf.ingest_batch_size; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -430,6 +431,7 @@ impl ConnectionManagerState { connect_timeout, ctx, node_id, + ingest_batch_size, ) .await; @@ -1345,6 +1347,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), auth_token: None, availability_zone: None, + ingest_batch_size: 1, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 2b4aea7596..e398d683e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, @@ -106,6 +106,7 @@ impl From for WalReceiverError { /// Open a connection 
to the given safekeeper and receive WAL, sending back progress /// messages as we go. +#[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, @@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection( connect_timeout: Duration, ctx: RequestContext, node: NodeId, + ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -163,7 +165,7 @@ pub(super) async fn handle_walreceiver_connection( task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), "walreceiver connection", false, @@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(endlsn); + let mut modification = timeline.begin_modification(startlsn); + let mut uncommitted_records = 0; + let mut filtered_records = 0; while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - walingest + // Ingest the records without immediately committing them. + let ingested = walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; + + // Commit every ingest_batch_size records. 
Even if we filtered out + // all records, we still need to call commit to advance the LSN. + uncommitted_records += 1; + if uncommitted_records >= ingest_batch_size { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; + uncommitted_records = 0; + filtered_records = 0; + } + } + + // Commit the remaining records. + if uncommitted_records > 0 { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; } } @@ -396,11 +426,15 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let (timeline_logical_size, _) = timeline - .get_current_logical_size(&ctx) - .context("Status update creation failed to get current logical size")?; + let current_timeline_size = timeline + .get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::User, + &ctx, + ) + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy(); let status_update = PageserverFeedback { - current_timeline_size: timeline_logical_size, + current_timeline_size, last_received_lsn, disk_consistent_lsn, remote_consistent_lsn, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b58b883ab6..10bed7ca06 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -288,6 +288,9 @@ impl VirtualFile { } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); + // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case + // where our caller doesn't get to use the returned VirtualFile before its + // slot gets re-used by someone else. 
let file = STORAGE_IO_TIME_METRIC .get(StorageIoOperation::Open) .observe_closure_duration(|| open_options.open(path))?; @@ -311,6 +314,9 @@ impl VirtualFile { timeline_id, }; + // TODO: Under pressure, it's likely the slot will get re-used and + // the underlying file closed before they get around to using it. + // => https://github.com/neondatabase/neon/issues/6065 slot_guard.file.replace(file); Ok(vfile) @@ -421,9 +427,12 @@ impl VirtualFile { // now locked in write-mode. Find a free slot to put it in. let (handle, mut slot_guard) = open_files.find_victim_slot(); - // Open the physical file + // Re-open the physical file. + // NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this + // case from StorageIoOperation::Open. This helps with identifying thrashing + // of the virtual file descriptor cache. let file = STORAGE_IO_TIME_METRIC - .get(StorageIoOperation::Open) + .get(StorageIoOperation::OpenAfterReplace) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it @@ -610,9 +619,11 @@ impl Drop for VirtualFile { slot.recently_used.store(false, Ordering::Relaxed); // there is also operation "close-by-replace" for closes done on eviction for // comparison.
- STORAGE_IO_TIME_METRIC - .get(StorageIoOperation::Close) - .observe_closure_duration(|| drop(slot_guard.file.take())); + if let Some(fd) = slot_guard.file.take() { + STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::Close) + .observe_closure_duration(|| drop(fd)); + } } } } @@ -643,6 +654,7 @@ pub fn init(num_slots: usize) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } const TEST_MAX_FILE_DESCRIPTORS: usize = 10; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e9b2b78499..bb1aec030b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,6 +21,7 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use pageserver_api::shard::ShardIdentity; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; @@ -28,8 +29,10 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use utils::failpoint_support; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -45,9 +48,8 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -pub struct WalIngest<'a> { - timeline: &'a Timeline, - +pub struct WalIngest { + shard: ShardIdentity, checkpoint: CheckPoint, checkpoint_modified: bool, } @@ -67,12 +69,12 @@ enum IngestRecordOutcome { UnexpectedRecordType, } -impl<'a> WalIngest<'a> { +impl WalIngest { pub async fn new( - timeline: &'a Timeline, + timeline: &Timeline, startpoint: Lsn, - ctx: &'_ RequestContext, - ) -> anyhow::Result> { + ctx: 
&RequestContext, + ) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -80,7 +82,7 @@ impl<'a> WalIngest<'a> { trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); Ok(WalIngest { - timeline, + shard: *timeline.get_shard_identity(), checkpoint, checkpoint_modified: false, }) @@ -94,6 +96,8 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. /// + /// This function returns `true` if the record was ingested, and `false` if it was filtered out + /// pub async fn ingest_record( &mut self, recdata: Bytes, @@ -101,9 +105,13 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, ctx: &RequestContext, - ) -> anyhow::Result<()> { - modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version)?; + ) -> anyhow::Result { + WAL_INGEST.records_received.inc(); + let pg_version = modification.tline.pg_version; + let prev_len = modification.len(); + + modification.set_lsn(lsn)?; + decode_wal_record(recdata, decoded, pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -146,9 +154,9 @@ impl<'a> WalIngest<'a> { } pg_constants::RM_DBASE_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if self.timeline.pg_version == 14 { + if pg_version == 14 { if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); @@ -171,7 +179,7 @@ impl<'a> WalIngest<'a> { } else { IngestRecordOutcome::UnknownRecordType } - } else if self.timeline.pg_version == 15 { + } 
else if pg_version == 15 { if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); IngestRecordOutcome::Noop @@ -199,7 +207,7 @@ impl<'a> WalIngest<'a> { } else { IngestRecordOutcome::UnknownRecordType } - } else if self.timeline.pg_version == 16 { + } else if pg_version == 16 { if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); IngestRecordOutcome::Noop @@ -437,9 +445,7 @@ impl<'a> WalIngest<'a> { // particular point in the WAL. For more fine-grained control, // we could peek into the message and only pause if it contains // a particular string, for example, but this is enough for now. - crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); + failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); IngestRecordOutcome::Noop } else if let Some(path) = prefix.strip_prefix("neon-file:") { modification.put_file(path, message, ctx).await?; @@ -493,6 +499,33 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + + let key = rel_block_to_key(rel, blk.blkno); + let key_is_local = self.shard.is_key_local(&key); + + tracing::debug!( + lsn=%lsn, + key=%key, + "ingest: shard decision {} (checkpoint={})", + if !key_is_local { "drop" } else { "keep" }, + self.checkpoint_modified + ); + + if !key_is_local { + if self.shard.is_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. 
+ self.observe_decoded_block(modification, blk, ctx).await?; + } + + continue; + } self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -505,11 +538,28 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = false; } - // Now that this record has been fully handled, including updating the - // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit(ctx).await?; + // Note that at this point this record is only cached in the modification + // until commit() is called to flush the data into the repository and update + // the latest LSN. - Ok(()) + Ok(modification.len() > prev_len) + } + + /// Do not store this block, but observe it for the purposes of updating our relation size state. + async fn observe_decoded_block( + &mut self, + modification: &mut DatadirModification<'_>, + blk: &DecodedBkpBlock, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + self.handle_rel_extend(modification, rel, blk.blkno, ctx) + .await } async fn ingest_decoded_block( @@ -538,8 +588,10 @@ impl<'a> WalIngest<'a> { && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + // compression of WAL is not yet supported: fall back to storing the original WAL record + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
+ // do not materialize null pages because they will most likely soon be replaced with real data + && blk.bimg_len != 0 { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -591,7 +643,7 @@ impl<'a> WalIngest<'a> { let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; #[allow(clippy::if_same_then_else)] - let outcome = match self.timeline.pg_version { + let outcome = match modification.tline.pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -933,7 +985,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -1023,10 +1075,11 @@ impl<'a> WalIngest<'a> { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - match self.timeline.pg_version { + match pg_version { 16 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -1089,7 +1142,7 @@ impl<'a> WalIngest<'a> { } _ => bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", - self.timeline.pg_version + pg_version ), } @@ -1112,7 +1165,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't.
- let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -1190,16 +1243,14 @@ impl<'a> WalIngest<'a> { let src_db_id = rec.src_db_id; let src_tablespace_id = rec.src_tablespace_id; - // Creating a database is implemented by copying the template (aka. source) database. - // To copy all the relations, we need to ask for the state as of the same LSN, but we - // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for - // the last valid LSN to advance up to it. So we use the previous record's LSN in the - // get calls instead. - let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) + .list_rels( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -1207,7 +1258,12 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) + .get_relmap_file( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) @@ -1221,7 +1277,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1239,7 +1295,13 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + true, + ctx, + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1310,7 +1372,7 @@ impl<'a> WalIngest<'a> { 
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -1332,7 +1394,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1405,10 +1467,9 @@ impl<'a> WalIngest<'a> { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true, ctx) + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1462,10 +1523,9 @@ impl<'a> WalIngest<'a> { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1677,20 +1737,6 @@ impl<'a> WalIngest<'a> { Ok(()) } - async fn get_relsize( - &mut self, - rel: RelTag, - lsn: Lsn, - ctx: &RequestContext, - ) -> anyhow::Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { - 0 - } else { - self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
- }; - Ok(nblocks) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -1702,10 +1748,21 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = modification.lsn; - let old_nblocks = if !self - .timeline - .get_rel_exists(rel, last_lsn, true, ctx) + + // Get current size and put rel creation if rel doesn't exist + // + // NOTE: we check the cache first even though get_rel_exists and get_rel_size would + // check the cache too. This is because eagerly checking the cache results in + // less work overall and 10% better performance. It's more work on cache miss + // but cache miss is rare. + let old_nblocks = if let Some(nblocks) = modification + .tline + .get_cached_rel_size(&rel, modification.get_lsn()) + { + nblocks + } else if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1715,15 +1772,25 @@ impl<'a> WalIngest<'a> { .context("Relation Error")?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); modification.put_rel_extend(rel, new_nblocks, ctx).await?; + let mut key = rel_block_to_key(rel, blknum); // fill the gap with zeros for gap_blknum in old_nblocks..blknum { + key.field6 = gap_blknum; + + if self.shard.get_shard_number(&key) != self.shard.number { + continue; + } + modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; } } @@ -1761,10 +1828,9 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. 
// TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn, ctx) + let old_nblocks = if !modification + .tline + .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1773,8 +1839,9 @@ impl<'a> WalIngest<'a> { .await?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn, ctx) + modification + .tline + .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx) .await? }; @@ -1797,11 +1864,32 @@ impl<'a> WalIngest<'a> { } } +async fn get_relsize( + modification: &DatadirModification<'_>, + rel: RelTag, + ctx: &RequestContext, +) -> anyhow::Result { + let nblocks = if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .await? + { + 0 + } else { + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? + }; + Ok(nblocks) +} + #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use super::*; use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1822,10 +1910,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test<'a>( - tline: &'a Timeline, - ctx: &RequestContext, - ) -> Result> { + async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file @@ -1870,29 +1955,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); @@ -1900,46 +1985,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( 
tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1955,19 +2040,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); @@ -1975,13 +2060,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1994,7 +2079,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) .await?, 0 ); @@ -2007,19 +2092,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 
TEST_IMG("foo blk 1") ); @@ -2032,21 +2117,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -2073,13 +2158,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); @@ -2092,7 +2177,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, false ); @@ -2110,13 +2195,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, 1 ); @@ -2149,24 +2234,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, relsize ); @@ -2177,7 +2262,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2194,7 +2279,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 1 ); @@ -2204,7 +2289,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2213,7 +2298,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, relsize ); @@ -2222,7 +2307,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2242,13 +2327,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x80), false, 
&ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, relsize ); @@ -2258,7 +2343,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2291,7 +2376,9 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE + 1 ); @@ -2303,7 +2390,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2316,7 +2405,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2332,7 +2423,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, size as BlockNumber ); @@ -2367,21 +2460,25 @@ mod tests { let pg_version = 15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); - let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let _endpoint 
= Lsn::from_hex("1FFFF98").unwrap(); + + let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let (tenant, ctx) = harness.load().await; + + let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); + + std::fs::create_dir_all(initdb_path.parent().unwrap()) + .expect("creating test dir should work"); + std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works"); // Bootstrap a real timeline. We can't use create_test_timeline because // it doesn't create a real checkpoint, and Walingest::new tries to parse // the garbage data. - // - // TODO use the initdb.tar.zst file stored with the test data to avoid - // problems with inconsistent initdb results after pg minor version bumps. - let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") - .unwrap() - .load() - .await; let tline = tenant - .bootstrap_timeline(TIMELINE_ID, pg_version, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx) .await .unwrap(); @@ -2407,7 +2504,7 @@ mod tests { let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) .await .unwrap(); - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(startpoint); let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); @@ -2421,6 +2518,7 @@ mod tests { .await .unwrap(); } + modification.commit(&ctx).await.unwrap(); } let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index edce158e75..6918698f29 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,6 +22,7 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use pageserver_api::shard::TenantShardId; use serde::Serialize; use std::collections::VecDeque; use 
std::io; @@ -34,16 +35,12 @@ use std::process::{Child, ChildStdin, ChildStdout, Command}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::Duration; use std::time::Instant; -use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; -#[cfg(feature = "testing")] -use pageserver_api::shard::TenantShardId; - use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -93,7 +90,7 @@ struct ProcessOutput { /// records. /// pub struct PostgresRedoManager { - tenant_id: TenantId, + tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, @@ -124,7 +121,9 @@ impl PostgresRedoManager { /// The WAL redo is handled by a separate thread, so this just sends a request /// to the thread and waits for response. /// - /// CANCEL SAFETY: NOT CANCEL SAFE. + /// # Cancel-Safety + /// + /// This method is cancellation-safe. pub async fn request_redo( &self, key: Key, @@ -157,7 +156,6 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) - .await }; img = Some(result?); @@ -178,7 +176,6 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) - .await } } } @@ -187,10 +184,13 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { + pub fn new( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. 
PostgresRedoManager { - tenant_id, + tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), @@ -216,7 +216,7 @@ impl PostgresRedoManager { /// Process one request for WAL redo using wal-redo postgres /// #[allow(clippy::too_many_arguments)] - async fn apply_batch_postgres( + fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -245,8 +245,12 @@ impl PostgresRedoManager { let timer = WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( - WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) - .context("launch walredo process")?, + WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, ); timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); @@ -332,12 +336,7 @@ impl PostgresRedoManager { // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. - let mut wait_done = proc.stderr_logger_task_done.clone(); drop(proc); - wait_done - .wait_for(|v| *v) - .await - .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender"); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); } @@ -644,13 +643,11 @@ impl CloseFileDescriptors for C { struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. child: Option, stdout: Mutex, stdin: Mutex, - stderr_logger_cancel: CancellationToken, - stderr_logger_task_done: tokio::sync::watch::Receiver, /// Counter to separate same sized walredo inputs failing at the same millisecond. #[cfg(feature = "testing")] dump_sequence: AtomicUsize, @@ -660,10 +657,10 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. 
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] fn launch( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. @@ -688,7 +685,7 @@ impl WalRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child(tenant_id) + .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { @@ -699,6 +696,8 @@ impl WalRedoProcess { let stdin = child.stdin.take().unwrap(); let stdout = child.stdout.take().unwrap(); let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; macro_rules! set_nonblock_or_log_err { ($file:ident) => {{ let res = set_nonblock($file.as_raw_fd()); @@ -710,73 +709,49 @@ impl WalRedoProcess { } set_nonblock_or_log_err!(stdin)?; set_nonblock_or_log_err!(stdout)?; - set_nonblock_or_log_err!(stderr)?; - - let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?; // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - let stderr_logger_cancel = CancellationToken::new(); - let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) = - tokio::sync::watch::channel(false); - tokio::spawn({ - let stderr_logger_cancel = stderr_logger_cancel.clone(); + tokio::spawn( async move { scopeguard::defer! 
{ debug!("wal-redo-postgres stderr_logger_task finished"); - let _ = stderr_logger_task_done_tx.send(true); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); } debug!("wal-redo-postgres stderr_logger_task started"); - loop { - // NB: we purposefully don't do a select! for the cancellation here. - // The cancellation would likely cause us to miss stderr messages. - // We can rely on this to return from .await because when we SIGKILL - // the child, the writing end of the stderr pipe gets closed. - match stderr.readable_mut().await { - Ok(mut guard) => { - let mut errbuf = [0; 16384]; - let res = guard.try_io(|fd| { - use std::io::Read; - fd.get_mut().read(&mut errbuf) - }); - match res { - Ok(Ok(0)) => { - // it closed the stderr pipe - break; - } - Ok(Ok(n)) => { - // The message might not be split correctly into lines here. But this is - // good enough, the important thing is to get the message to the log. - let output = String::from_utf8_lossy(&errbuf[0..n]).to_string(); - error!(output, "received output"); - }, - Ok(Err(e)) => { - error!(error = ?e, "read() error, waiting for cancellation"); - stderr_logger_cancel.cancelled().await; - error!(error = ?e, "read() error, cancellation complete"); - break; - } - Err(e) => { - let _e: tokio::io::unix::TryIoError = e; - // the read() returned WouldBlock, that's expected - } - } + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); } Err(e) => { - error!(error = ?e, "read() error, waiting for cancellation"); - stderr_logger_cancel.cancelled().await; - error!(error = ?e, "read() error, cancellation complete"); - break; + break Err(e); } } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version)) - }); + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); Ok(Self { conf, - tenant_id, + tenant_shard_id, child: Some(child), stdin: Mutex::new(ProcessInput { stdin, @@ -787,8 +762,6 @@ impl WalRedoProcess { pending_responses: VecDeque::new(), n_processed_responses: 0, }), - stderr_logger_cancel, - stderr_logger_task_done: stderr_logger_task_done_rx, #[cfg(feature = "testing")] dump_sequence: AtomicUsize::default(), }) @@ -804,7 +777,7 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] fn apply_wal_records( &self, tag: BufferTag, @@ -998,11 +971,7 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. 
- let path = self - .conf - .tenant_path(&TenantShardId::unsharded(self.tenant_id)) - .join(&filename); + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1029,7 +998,6 @@ impl Drop for WalRedoProcess { .take() .expect("we only do this once") .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - self.stderr_logger_cancel.cancel(); // no way to wait for stderr_logger_task from Drop because that is async only } } @@ -1037,7 +1005,7 @@ impl Drop for WalRedoProcess { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { - tenant_id: TenantId, + tenant_id: TenantShardId, child: Option, } @@ -1056,7 +1024,7 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, @@ -1111,7 +1079,7 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. @@ -1119,7 +1087,11 @@ impl Drop for NoLeakChild { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. 
- let span = tracing::info_span!("walredo", %tenant_id); + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) @@ -1129,11 +1101,11 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } @@ -1188,6 +1160,7 @@ mod tests { use crate::repository::Key; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; use bytes::Bytes; + use pageserver_api::shard::TenantShardId; use std::str::FromStr; use utils::{id::TenantId, lsn::Lsn}; @@ -1297,9 +1270,9 @@ mod tests { let repo_dir = camino_tempfile::tempdir()?; let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); Ok(RedoHarness { _repo_dir: repo_dir, diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7fc99523db..c6b224a14d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,7 @@ OBJS = \ libpagestore.o \ neon.o \ neon_utils.o \ + neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ @@ -41,6 +42,17 @@ libwalproposer.a: $(WALPROP_OBJS) rm -f $@ $(AR) $(AROPT) $@ $^ +# needs vars: +# FIND_TYPEDEF pointing to find_typedef +# INDENT pointing to pg_bsd_indent +# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name: 
+# pgindent will pick it up as pg_bsd_indent path). +.PHONY: pgindent +pgindent: + +@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir) + $(FIND_TYPEDEF) . > neon.typedefs + INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h + PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 2546e6de5e..e467a9c43a 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -19,20 +19,21 @@ *------------------------------------------------------------------------- */ #include "postgres.h" + +#include + +#include "access/xact.h" +#include "commands/defrem.h" +#include "fmgr.h" +#include "libpq/crypt.h" +#include "miscadmin.h" #include "tcop/pquery.h" #include "tcop/utility.h" -#include "access/xact.h" +#include "utils/acl.h" +#include "utils/guc.h" #include "utils/hsearch.h" #include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include #include "utils/jsonb.h" -#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -41,7 +42,7 @@ static char *ConsoleURL = NULL; static bool ForwardDDL = true; /* Curl structures for sending the HTTP requests */ -static CURL * CurlHandle; +static CURL *CurlHandle; static struct curl_slist *ContentHeader = NULL; /* @@ -54,7 +55,7 @@ typedef enum { Op_Set, /* An upsert: Either a creation or an alter */ Op_Delete, -} OpType; +} OpType; typedef struct { @@ -62,7 +63,7 @@ typedef struct Oid owner; char old_name[NAMEDATALEN]; OpType type; -} DbEntry; +} DbEntry; typedef struct { @@ -70,7 +71,7 @@ typedef struct char old_name[NAMEDATALEN]; const char *password; OpType type; -} RoleEntry; +} RoleEntry; /* * We keep one of these for each subtransaction in a stack. 
When a subtransaction @@ -82,10 +83,10 @@ typedef struct DdlHashTable struct DdlHashTable *prev_table; HTAB *db_table; HTAB *role_table; -} DdlHashTable; +} DdlHashTable; static DdlHashTable RootTable; -static DdlHashTable * CurrentDdlTable = &RootTable; +static DdlHashTable *CurrentDdlTable = &RootTable; static void PushKeyValue(JsonbParseState **state, char *key, char *value) @@ -199,7 +200,7 @@ typedef struct { char str[ERROR_SIZE]; size_t size; -} ErrorString; +} ErrorString; static size_t ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) @@ -478,7 +479,7 @@ NeonXactCallback(XactEvent event, void *arg) static bool RoleIsNeonSuperuser(const char *role_name) { - return strcmp(role_name, "neon_superuser") == 0; + return strcmp(role_name, "neon_superuser") == 0; } static void @@ -509,6 +510,7 @@ HandleCreateDb(CreatedbStmt *stmt) if (downer && downer->arg) { const char *owner_name = defGetString(downer); + if (RoleIsNeonSuperuser(owner_name)) elog(ERROR, "can't create a database with owner neon_superuser"); entry->owner = get_role_oid(owner_name, false); @@ -536,6 +538,7 @@ HandleAlterOwner(AlterOwnerStmt *stmt) if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); const char *new_owner = get_rolespec_name(stmt->newowner); + if (RoleIsNeonSuperuser(new_owner)) elog(ERROR, "can't alter owner to neon_superuser"); entry->owner = get_role_oid(new_owner, false); @@ -633,6 +636,7 @@ HandleAlterRole(AlterRoleStmt *stmt) DefElem *dpass = NULL; ListCell *option; const char *role_name = stmt->role->rolename; + if (RoleIsNeonSuperuser(role_name)) elog(ERROR, "can't ALTER neon_superuser"); diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 6053425de0..d9a75142f1 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -1,4 +1,3 @@ - /*------------------------------------------------------------------------- * * extension_server.c @@ -10,94 +9,86 @@ 
*------------------------------------------------------------------------- */ #include "postgres.h" -#include "tcop/pquery.h" -#include "tcop/utility.h" -#include "access/xact.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include "fmgr.h" #include -static int extension_server_port = 0; +#include "utils/guc.h" + +static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; -// to download all SQL (and data) files for an extension: -// curl -X POST http://localhost:8080/extension_server/postgis -// it covers two possible extension files layouts: -// 1. extension_name--version--platform.sql -// 2. extension_name/extension_name--version.sql -// extension_name/extra_files.csv -// -// to download specific library file: -// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true +/* + * to download all SQL (and data) files for an extension: + * curl -X POST http://localhost:8080/extension_server/postgis + * it covers two possible extension files layouts: + * 1. extension_name--version--platform.sql + * 2. 
extension_name/extension_name--version.sql + * extension_name/extra_files.csv + * to download specific library file: + * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true + */ static bool neon_download_extension_file_http(const char *filename, bool is_library) { - CURL *curl; - CURLcode res; - char *compute_ctl_url; - char *postdata; - bool ret = false; + CURL *curl; + CURLcode res; + char *compute_ctl_url; + char *postdata; + bool ret = false; - if ((curl = curl_easy_init()) == NULL) - { - elog(ERROR, "Failed to initialize curl handle"); - } + if ((curl = curl_easy_init()) == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } - compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", - extension_server_port, filename, is_library ? "?is_library=true" : ""); + compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", + extension_server_port, filename, is_library ? "?is_library=true" : ""); - elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); + elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */); + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ ); - if (curl) - { - /* Perform the request, res will get the return code */ - res = curl_easy_perform(curl); - /* Check for errors */ - if (res == CURLE_OK) - { - ret = true; - } - else - { - // Don't error here because postgres will try to find the file - // and will fail with some proper error message if it's not found. 
- elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); - } + if (curl) + { + /* Perform the request, res will get the return code */ + res = curl_easy_perform(curl); + /* Check for errors */ + if (res == CURLE_OK) + { + ret = true; + } + else + { + /* Don't error here because postgres will try to find the file */ + /* and will fail with some proper error message if it's not found. */ + elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); + } - /* always cleanup */ - curl_easy_cleanup(curl); - } + /* always cleanup */ + curl_easy_cleanup(curl); + } - return ret; + return ret; } -void pg_init_extension_server() +void +pg_init_extension_server() { - // Port to connect to compute_ctl on localhost - // to request extension files. - DefineCustomIntVariable("neon.extension_server_port", - "connection string to the compute_ctl", - NULL, - &extension_server_port, - 0, 0, INT_MAX, - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); + /* Port to connect to compute_ctl on localhost */ + /* to request extension files. 
*/ + DefineCustomIntVariable("neon.extension_server_port", + "connection string to the compute_ctl", + NULL, + &extension_server_port, + 0, 0, INT_MAX, + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); - // set download_extension_file_hook - prev_download_extension_file_hook = download_extension_file_hook; - download_extension_file_hook = neon_download_extension_file_http; + /* set download_extension_file_hook */ + prev_download_extension_file_hook = download_extension_file_hook; + download_extension_file_hook = neon_download_extension_file_http; } diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index e70f0163c0..6725ce8fff 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -13,32 +13,30 @@ *------------------------------------------------------------------------- */ +#include "postgres.h" + #include #include #include -#include "postgres.h" - #include "neon_pgversioncompat.h" +#include "access/parallel.h" #include "funcapi.h" #include "miscadmin.h" -#include "pgstat.h" #include "pagestore_client.h" -#include "access/parallel.h" +#include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" -#include "storage/latch.h" +#include "storage/fd.h" #include "storage/ipc.h" +#include "storage/latch.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" -#include "storage/fd.h" -#include "storage/pg_shmem.h" -#include "storage/buf_internals.h" -#include "pgstat.h" /* * Local file cache is used to temporary store relations pages in local file system. 
@@ -67,32 +65,34 @@ typedef struct FileCacheEntry { BufferTag key; - uint32 hash; + uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[BLOCKS_PER_CHUNK/32]; - dlist_node lru_node; /* LRU list node */ + uint32 bitmap[BLOCKS_PER_CHUNK / 32]; + dlist_node lru_node; /* LRU list node */ } FileCacheEntry; typedef struct FileCacheControl { - uint64 generation; /* generation is needed to handle correct hash reenabling */ - uint32 size; /* size of cache file in chunks */ - uint32 used; /* number of used chunks */ - uint32 limit; /* shared copy of lfc_size_limit */ - uint64 hits; - uint64 misses; - uint64 writes; - dlist_head lru; /* double linked list for LRU replacement algorithm */ + uint64 generation; /* generation is needed to handle correct hash + * reenabling */ + uint32 size; /* size of cache file in chunks */ + uint32 used; /* number of used chunks */ + uint32 limit; /* shared copy of lfc_size_limit */ + uint64 hits; + uint64 misses; + uint64 writes; + dlist_head lru; /* double linked list for LRU replacement + * algorithm */ } FileCacheControl; -static HTAB* lfc_hash; -static int lfc_desc = 0; +static HTAB *lfc_hash; +static int lfc_desc = 0; static LWLockId lfc_lock; -static int lfc_max_size; -static int lfc_size_limit; -static char* lfc_path; -static FileCacheControl* lfc_ctl; +static int lfc_max_size; +static int lfc_size_limit; +static char *lfc_path; +static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; @@ -100,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) -void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg); - /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. 
@@ -109,9 +107,10 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg); * All cache content should be invalidated to avoid reading of stale or corrupted data */ static void -lfc_disable(char const* op) +lfc_disable(char const *op) { - int fd; + int fd; + elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); /* Invalidate hash */ @@ -120,7 +119,7 @@ lfc_disable(char const* op) if (LFC_ENABLED()) { HASH_SEQ_STATUS status; - FileCacheEntry* entry; + FileCacheEntry *entry; hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) @@ -135,16 +134,24 @@ lfc_disable(char const* op) if (lfc_desc > 0) { - /* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */ - int rc = ftruncate(lfc_desc, 0); + /* + * If the reason of error is ENOSPC, then truncation of file may + * help to reclaim some space + */ + int rc = ftruncate(lfc_desc, 0); + if (rc < 0) elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); } } - /* We need to use unlink to to avoid races in LFC write, because it is not protectedby */ + + /* + * We need to use unlink to to avoid races in LFC write, because it is not + * protectedby + */ unlink(lfc_path); - fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path); else @@ -170,13 +177,15 @@ lfc_maybe_disabled(void) static bool lfc_ensure_opened(void) { - bool enabled = !lfc_maybe_disabled(); + bool enabled = !lfc_maybe_disabled(); + /* Open cache file if not done yet */ if (lfc_desc <= 0 && enabled) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); - if (lfc_desc < 0) { + if (lfc_desc < 0) + { lfc_disable("open"); return false; } @@ -187,7 +196,7 @@ lfc_ensure_opened(void) static void lfc_shmem_startup(void) { - bool found; + bool found; static HASHCTL info; if (prev_shmem_startup_hook) @@ -197,17 
+206,22 @@ lfc_shmem_startup(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); + lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); if (!found) { - int fd; - uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); - lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock"); + int fd; + uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + + lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = sizeof(FileCacheEntry); + + /* + * lfc_size+1 because we add new element to hash table before eviction + * of victim + */ lfc_hash = ShmemInitHash("lfc_hash", - /* lfc_size+1 because we add new element to hash table before eviction of victim */ - lfc_size+1, lfc_size+1, + lfc_size + 1, lfc_size + 1, &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->generation = 0; @@ -219,7 +233,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Recreate file cache on restart */ - fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { elog(WARNING, "Failed to create local file cache %s: %m", lfc_path); @@ -242,7 +256,7 @@ lfc_shmem_request(void) prev_shmem_request_hook(); #endif - RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry))); + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry))); RequestNamedLWLockTranche("lfc_lock", 1); } @@ -250,9 +264,11 @@ static bool is_normal_backend(void) { /* - * Stats collector detach shared memory, so we should not try to access shared memory here. - * Parallel workers first assign default value (0), so not perform truncation in parallel workers. 
- * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC. + * Stats collector detach shared memory, so we should not try to access + * shared memory here. Parallel workers first assign default value (0), so + * not perform truncation in parallel workers. The Postmaster can handle + * SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), + * but has no PGPROC. */ return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker(); } @@ -271,7 +287,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source) static void lfc_change_limit_hook(int newval, void *extra) { - uint32 new_size = SIZE_MB_TO_CHUNKS(newval); + uint32 new_size = SIZE_MB_TO_CHUNKS(newval); if (!is_normal_backend()) return; @@ -283,11 +299,15 @@ lfc_change_limit_hook(int newval, void *extra) while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { - /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */ - FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + /* + * Shrink cache by throwing away least recently accessed chunks and + * returning their space to file system + */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE - if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0) + if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) elog(LOG, "Failed to punch hole in file: %m"); #endif hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -314,7 +334,7 @@ lfc_init(void) "Maximal size of Neon local file cache", NULL, &lfc_max_size, - 0, /* disabled by default */ 
+ 0, /* disabled by default */ 0, INT_MAX, PGC_POSTMASTER, @@ -327,7 +347,7 @@ lfc_init(void) "Current limit for size of Neon local file cache", NULL, &lfc_size_limit, - 0, /* disabled by default */ + 0, /* disabled by default */ 0, INT_MAX, PGC_SIGHUP, @@ -367,18 +387,18 @@ lfc_init(void) bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { - BufferTag tag; - FileCacheEntry* entry; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); - bool found = false; - uint32 hash; + BufferTag tag; + FileCacheEntry *entry; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + bool found = false; + uint32 hash; - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return false; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); @@ -397,13 +417,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { - BufferTag tag; - FileCacheEntry* entry; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); - uint32 hash; + BufferTag tag; + FileCacheEntry *entry; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + uint32 hash; - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; CopyNRelFileInfoToBufTag(tag, rinfo); @@ -438,9 +458,10 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) */ if (entry->bitmap[chunk_offs >> 5] == 0) { - bool has_remaining_pages; + bool has_remaining_pages; - for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) { + for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) + { if (entry->bitmap[i] != 0) { has_remaining_pages = true; @@ -449,8 
+470,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Put the entry at the position that is first to be reclaimed when - * we have no cached pages remaining in the chunk + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk */ if (!has_remaining_pages) { @@ -476,16 +497,16 @@ bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) { - BufferTag tag; - FileCacheEntry* entry; - ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); - bool result = true; - uint32 hash; - uint64 generation; - uint32 entry_offset; + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + bool result = true; + uint32 hash; + uint64 generation; + uint32 entry_offset; - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return false; if (!lfc_ensure_opened()) @@ -493,7 +514,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -520,7 +541,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); if (rc != BLCKSZ) { lfc_disable("read"); @@ -551,30 +572,29 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. 
*/ void -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, #if PG_MAJORVERSION_NUM < 16 - char *buffer) +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else - const void *buffer) +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) #endif { - BufferTag tag; - FileCacheEntry* entry; - ssize_t rc; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); - uint32 hash; - uint64 generation; - uint32 entry_offset; + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + uint32 hash; + uint64 generation; + uint32 entry_offset; - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; if (!lfc_ensure_opened()) return; tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); hash = get_hash_value(lfc_hash, &tag); @@ -590,24 +610,36 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (found) { - /* Unlink entry from LRU list to pin it for the duration of IO operation */ + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ if (entry->access_count++ == 0) dlist_delete(&entry->lru_node); } else { /* - * We have two choices if all cache pages are pinned (i.e. used in IO operations): - * 1. Wait until some of this operation is completed and pages is unpinned - * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit. - * As far as probability of such event (that all pages are pinned) is considered to be very very small: - * there are should be very large number of concurrent IO operations and them are limited by max_connections, - * we prefer not to complicate code and use second approach. 
+ * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. + * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. */ if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ - FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -616,7 +648,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + entry->offset = lfc_ctl->size++; /* allocate new chunk at end + * of file */ } entry->access_count = 1; entry->hash = hash; @@ -628,7 +661,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->writes += 1; LWLockRelease(lfc_lock); - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); if (rc != BLCKSZ) { lfc_disable("write"); @@ -665,13 +698,13 @@ Datum neon_get_lfc_stats(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; - NeonGetStatsCtx* fctx; + NeonGetStatsCtx *fctx; MemoryContext oldcontext; TupleDesc tupledesc; Datum 
result; HeapTuple tuple; - char const* key; - uint64 value; + char const *key; + uint64 value; Datum values[NUM_NEON_GET_STATS_COLS]; bool nulls[NUM_NEON_GET_STATS_COLS]; @@ -683,7 +716,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ - fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx)); + fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx)); /* Construct a tuple descriptor for the result rows. */ tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS); @@ -704,7 +737,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) funcctx = SRF_PERCALL_SETUP(); /* Get the saved state */ - fctx = (NeonGetStatsCtx*) funcctx->user_fctx; + fctx = (NeonGetStatsCtx *) funcctx->user_fctx; switch (funcctx->call_cntr) { @@ -792,9 +825,9 @@ local_cache_pages(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { - HASH_SEQ_STATUS status; - FileCacheEntry* entry; - uint32 n_pages = 0; + HASH_SEQ_STATUS status; + FileCacheEntry *entry; + uint32 n_pages = 0; funcctx = SRF_FIRSTCALL_INIT(); @@ -851,7 +884,7 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++) + for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++) n_pages += pg_popcount32(entry->bitmap[i]); } } @@ -870,10 +903,11 @@ local_cache_pages(PG_FUNCTION_ARGS) if (n_pages != 0) { /* - * Scan through all the cache entries, saving the relevant fields in the - * fctx->record structure. + * Scan through all the cache entries, saving the relevant fields + * in the fctx->record structure. 
*/ - uint32 n = 0; + uint32 n = 0; + hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { @@ -881,7 +915,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { if (entry->bitmap[i >> 5] & (1 << (i & 31))) { - fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; + fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 8eb9ebb915..3a7c0f1bb6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,32 +14,29 @@ */ #include "postgres.h" -#include "pagestore_client.h" -#include "fmgr.h" #include "access/xlog.h" -#include "access/xlogutils.h" -#include "storage/buf_internals.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/pg_shmem.h" -#include "c.h" -#include "postmaster/interrupt.h" - +#include "fmgr.h" #include "libpq-fe.h" -#include "libpq/pqformat.h" #include "libpq/libpq.h" - +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/guc.h" #include "neon.h" -#include "walproposer.h" #include "neon_utils.h" +#include "pagestore_client.h" +#include "walproposer.h" #define PageStoreTrace DEBUG5 -#define RECONNECT_INTERVAL_USEC 1000000 +#define MIN_RECONNECT_INTERVAL_USEC 1000 +#define MAX_RECONNECT_INTERVAL_USEC 1000000 bool connected = false; PGconn *pageserver_conn = NULL; @@ -62,16 +59,16 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int n_reconnect_attempts = 0; -int max_reconnect_attempts = 60; +static int n_reconnect_attempts = 0; 
+static int max_reconnect_attempts = 60; #define MAX_PAGESERVER_CONNSTRING_SIZE 256 typedef struct { - LWLockId lock; - pg_atomic_uint64 update_counter; - char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; + LWLockId lock; + pg_atomic_uint64 update_counter; + char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; } PagestoreShmemState; #if PG_VERSION_NUM >= 150000 @@ -83,51 +80,49 @@ static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - static bool pageserver_flush(void); static void pageserver_disconnect(void); static bool PagestoreShmemIsValid() { - return pagestore_shared && UsedShmemSegAddr; + return pagestore_shared && UsedShmemSegAddr; } static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source) { - return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; + return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; } static void AssignPageserverConnstring(const char *newval, void *extra) { - if(!PagestoreShmemIsValid()) - return; - LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); - strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); - pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); - LWLockRelease(pagestore_shared->lock); + if (!PagestoreShmemIsValid()) + return; + LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); + strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); + pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); + LWLockRelease(pagestore_shared->lock); } static bool CheckConnstringUpdated() { - if(!PagestoreShmemIsValid()) - return false; - return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); + if (!PagestoreShmemIsValid()) + return false; + return pagestore_local_counter < 
pg_atomic_read_u64(&pagestore_shared->update_counter); } static void ReloadConnstring() { - if(!PagestoreShmemIsValid()) - return; - LWLockAcquire(pagestore_shared->lock, LW_SHARED); - strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); - pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); - LWLockRelease(pagestore_shared->lock); + if (!PagestoreShmemIsValid()) + return; + LWLockAcquire(pagestore_shared->lock, LW_SHARED); + strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); + pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); + LWLockRelease(pagestore_shared->lock); } static bool @@ -139,23 +134,43 @@ pageserver_connect(int elevel) const char *values[3]; int n; + static TimestampTz last_connect_time = 0; + static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; + TimestampTz now; + uint64_t us_since_last_connect; + Assert(!connected); - if(CheckConnstringUpdated()) - { - ReloadConnstring(); - } + if (CheckConnstringUpdated()) + { + ReloadConnstring(); + } + + now = GetCurrentTimestamp(); + us_since_last_connect = now - last_connect_time; + if (us_since_last_connect < delay_us) + { + pg_usleep(delay_us - us_since_last_connect); + delay_us *= 2; + if (delay_us > MAX_RECONNECT_INTERVAL_USEC) + delay_us = MAX_RECONNECT_INTERVAL_USEC; + last_connect_time = GetCurrentTimestamp(); + } + else + { + delay_us = MIN_RECONNECT_INTERVAL_USEC; + last_connect_time = now; + } /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment * variable was set, use that as the password. * - * The connection options are parsed in the order they're given, so - * when we set the password before the connection string, the - * connection string can override the password from the env variable. 
- * Seems useful, although we don't currently use that capability - * anywhere. + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. */ n = 0; if (neon_auth_token) @@ -198,9 +213,9 @@ pageserver_connect(int elevel) pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); + MyLatch, NULL); AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); + NULL, NULL); AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL); while (PQisBusy(pageserver_conn)) @@ -265,6 +280,7 @@ retry: if (!PQconsumeInput(pageserver_conn)) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); + neon_log(LOG, "could not get response from pageserver: %s", msg); pfree(msg); return -1; @@ -305,15 +321,15 @@ pageserver_disconnect(void) } static bool -pageserver_send(NeonRequest * request) +pageserver_send(NeonRequest *request) { StringInfoData req_buff; - if(CheckConnstringUpdated()) - { - pageserver_disconnect(); - ReloadConnstring(); - } + if (CheckConnstringUpdated()) + { + pageserver_disconnect(); + ReloadConnstring(); + } /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) @@ -326,10 +342,12 @@ pageserver_send(NeonRequest * request) /* * If pageserver is stopped, the connections from compute node are broken. - * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query. - * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another. 
- * See https://github.com/neondatabase/neon/issues/1138 - * So try to reestablish connection in case of failure. + * The compute node doesn't notice that immediately, but it will cause the + * next request to fail, usually on the next query. That causes + * user-visible errors if pageserver is restarted, or the tenant is moved + * from one pageserver to another. See + * https://github.com/neondatabase/neon/issues/1138 So try to reestablish + * connection in case of failure. */ if (!connected) { @@ -337,7 +355,6 @@ pageserver_send(NeonRequest * request) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); } n_reconnect_attempts = 0; } @@ -353,6 +370,7 @@ pageserver_send(NeonRequest * request) if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); pfree(msg); @@ -410,7 +428,8 @@ pageserver_receive(void) } else if (rc == -2) { - char* msg = pchomp(PQerrorMessage(pageserver_conn)); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); } @@ -444,6 +463,7 @@ pageserver_flush(void) if (PQflush(pageserver_conn)) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); pfree(msg); @@ -471,46 +491,47 @@ check_neon_id(char **newval, void **extra, GucSource source) static Size PagestoreShmemSize(void) { - return sizeof(PagestoreShmemState); + return sizeof(PagestoreShmemState); } static bool PagestoreShmemInit(void) { - bool found; - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - pagestore_shared = ShmemInitStruct("libpagestore shared state", - PagestoreShmemSize(), - &found); - if(!found) - { - 
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); - pg_atomic_init_u64(&pagestore_shared->update_counter, 0); - AssignPageserverConnstring(page_server_connstring, NULL); - } - LWLockRelease(AddinShmemInitLock); - return found; + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + pagestore_shared = ShmemInitStruct("libpagestore shared state", + PagestoreShmemSize(), + &found); + if (!found) + { + pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); + pg_atomic_init_u64(&pagestore_shared->update_counter, 0); + AssignPageserverConnstring(page_server_connstring, NULL); + } + LWLockRelease(AddinShmemInitLock); + return found; } static void pagestore_shmem_startup_hook(void) { - if(prev_shmem_startup_hook) - prev_shmem_startup_hook(); + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); - PagestoreShmemInit(); + PagestoreShmemInit(); } static void pagestore_shmem_request(void) { #if PG_VERSION_NUM >= 150000 - if(prev_shmem_request_hook) - prev_shmem_request_hook(); + if (prev_shmem_request_hook) + prev_shmem_request_hook(); #endif - RequestAddinShmemSpace(PagestoreShmemSize()); - RequestNamedLWLockTranche("neon_libpagestore", 1); + RequestAddinShmemSpace(PagestoreShmemSize()); + RequestNamedLWLockTranche("neon_libpagestore", 1); } static void @@ -520,7 +541,7 @@ pagestore_prepare_shmem(void) prev_shmem_request_hook = shmem_request_hook; shmem_request_hook = pagestore_shmem_request; #else - pagestore_shmem_request(); + pagestore_shmem_request(); #endif prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = pagestore_shmem_startup_hook; @@ -532,7 +553,7 @@ pagestore_prepare_shmem(void) void pg_init_libpagestore(void) { - pagestore_prepare_shmem(); + pagestore_prepare_shmem(); DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", @@ -607,7 +628,10 @@ pg_init_libpagestore(void) neon_log(PageStoreTrace, "libpagestore already loaded"); page_server 
= &api; - /* Retrieve the auth token to use when connecting to pageserver and safekeepers */ + /* + * Retrieve the auth token to use when connecting to pageserver and + * safekeepers + */ neon_auth_token = getenv("NEON_AUTH_TOKEN"); if (neon_auth_token) neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable"); @@ -618,8 +642,6 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; - old_redo_read_buffer_filter = redo_read_buffer_filter; - redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h new file mode 100644 index 0000000000..cd7e568a47 --- /dev/null +++ b/pgxn/neon/libpqwalproposer.h @@ -0,0 +1,96 @@ +/* + * Interface to set of libpq wrappers walproposer and neon_walreader need. + * Similar to libpqwalreceiver, but it has blocking connection establishment and + * pqexec which don't fit us. Implementation is at walproposer_pg.c. + */ +#ifndef ___LIBPQWALPROPOSER_H__ +#define ___LIBPQWALPROPOSER_H__ + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Possible return values from walprop_async_read */ +typedef enum +{ + /* The full read was successful. 
buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from walprop_async_write */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * This header is included by walproposer.h to define walproposer_api; if we're + * building walproposer without pg, ignore libpq part, leaving only interface + * types. + */ +#ifndef WALPROPOSER_LIB + +#include "libpq-fe.h" + +/* + * Sometimes working directly with underlying PGconn is simpler, export the + * whole thing for simplicity. 
+ */ +typedef struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received CopyData message from + * walprop_async_read */ +} WalProposerConn; + +extern WalProposerConn *libpqwp_connect_start(char *conninfo); +extern bool libpqwp_send_query(WalProposerConn *conn, char *query); +extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); +extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); +extern void libpqwp_disconnect(WalProposerConn *conn); + +#endif /* WALPROPOSER_LIB */ +#endif /* ___LIBPQWALPROPOSER_H__ */ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 4850b0d6a1..b930fdb3ca 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -48,9 +48,11 @@ _PG_init(void) pg_init_extension_server(); - // Important: This must happen after other parts of the extension - // are loaded, otherwise any settings to GUCs that were set before - // the extension was loaded will be removed. + /* + * Important: This must happen after other parts of the extension are + * loaded, otherwise any settings to GUCs that were set before the + * extension was loaded will be removed. + */ EmitWarningsOnPlaceholders("neon"); } diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 3300c67456..c3afecc679 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -27,13 +27,6 @@ extern void pg_init_walproposer(void); extern void pg_init_extension_server(void); -/* - * Returns true if we shouldn't do REDO on that block in record indicated by - * block_id; false otherwise. 
- */ -extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); -extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index 8db0d5341e..f19732cbbb 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -59,7 +59,7 @@ #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers -#else /* major version >= 16 */ +#else /* major version >= 16 */ #define USE_RELFILELOCATOR @@ -109,4 +109,4 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif -#endif //NEON_PGVERSIONCOMPAT_H +#endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 06faea7490..9135847aaf 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,32 +1,10 @@ + +#include + #include "postgres.h" -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" +#include "lib/stringinfo.h" #include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "libpq-fe.h" -#include -#include - -#if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#endif -#if PG_MAJORVERSION_NUM >= 16 -#include "utils/guc.h" -#endif /* * Convert a character which represents a hexadecimal digit to an integer. 
@@ -114,3 +92,25 @@ pq_sendint64_le(StringInfo buf, uint64 i) memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); } + +/* + * Disables core dump for the current process. + */ +void +disable_core_dump() +{ + struct rlimit rlim; + +#ifdef WALPROPOSER_LIB /* skip in simulation mode */ + return; +#endif + + rlim.rlim_cur = 0; + rlim.rlim_max = 0; + if (setrlimit(RLIMIT_CORE, &rlim)) + { + int save_errno = errno; + + fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); + } +} diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index e3fafc8d0f..a86f1e061c 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,12 +1,11 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ -#include "postgres.h" - bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); +extern void disable_core_dump(); #endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c new file mode 100644 index 0000000000..f7ec9e5bfa --- /dev/null +++ b/pgxn/neon/neon_walreader.c @@ -0,0 +1,742 @@ +/* + * Like WALRead, but when WAL segment doesn't exist locally instead of throwing + * ERROR asynchronously tries to fetch it from the most advanced safekeeper. + * + * We can't use libpqwalreceiver as it blocks during connection establishment + * (and waiting for PQExec result), so use libpqwalproposer instead. + * + * TODO: keepalives are currently never sent, so the other side can close the + * connection prematurely. + * + * TODO: close conn if reading takes too long to prevent stuck connections. 
+ */ +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "libpq/pqformat.h" +#include "storage/fd.h" +#include "utils/wait_event.h" + +#include "libpq-fe.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +#define NEON_WALREADER_ERR_MSG_LEN 512 + +/* + * Can be called where NeonWALReader *state is available in the context, adds log_prefix. + */ +#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__) + +static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); +static void NeonWALReaderResetRemote(NeonWALReader *state); +static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +static void neon_wal_segment_close(NeonWALReader *state); +static bool is_wal_segment_exists(XLogSegNo segno, int segsize, + TimeLineID tli); + +/* + * State of connection to donor safekeeper. + */ +typedef enum +{ + /* no remote connection */ + RS_NONE, + /* doing PQconnectPoll, need readable socket */ + RS_CONNECTING_READ, + /* doing PQconnectPoll, need writable socket */ + RS_CONNECTING_WRITE, + /* Waiting for START_REPLICATION result */ + RS_WAIT_EXEC_RESULT, + /* replication stream established */ + RS_ESTABLISHED, +} NeonWALReaderRemoteState; + +struct NeonWALReader +{ + /* + * LSN before which we assume WAL is not available locally. Exists because + * though first segment after startup always exists, part before + * basebackup LSN is filled with zeros. + */ + XLogRecPtr available_lsn; + WALSegmentContext segcxt; + WALOpenSegment seg; + int wre_errno; + /* Explains failure to read, static for simplicity.
*/ + char err_msg[NEON_WALREADER_ERR_MSG_LEN]; + + /* + * Saved info about request in progress, used to check validity of + * arguments after resume and remember how far we accomplished it. req_lsn + * is 0 if there is no request in progress. + */ + XLogRecPtr req_lsn; + Size req_len; + Size req_progress; + WalProposer *wp; /* we learn donor through walproposer */ + char donor_name[64]; /* saved donor safekeeper name for logging */ + /* state of connection to safekeeper */ + NeonWALReaderRemoteState rem_state; + WalProposerConn *wp_conn; + + /* + * position in wp_conn recvbuf from which we'll copy WAL next time, or + * NULL if there is no unprocessed message + */ + char *wal_ptr; + Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ + + /* + * LSN of wal_ptr position according to walsender to cross check against + * read request + */ + XLogRecPtr rem_lsn; + + /* prepended to lines logged by neon_walreader, if provided */ + char log_prefix[64]; +}; + +/* palloc and initialize NeonWALReader */ +NeonWALReader * +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +{ + NeonWALReader *reader; + + reader = (NeonWALReader *) + palloc_extended(sizeof(NeonWALReader), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!reader) + return NULL; + + reader->available_lsn = available_lsn; + reader->seg.ws_file = -1; + reader->seg.ws_segno = 0; + reader->seg.ws_tli = 0; + reader->segcxt.ws_segsize = wal_segment_size; + + reader->wp = wp; + + reader->rem_state = RS_NONE; + + if (log_prefix) + strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); + + return reader; +} + +void +NeonWALReaderFree(NeonWALReader *state) +{ + if (state->seg.ws_file != -1) + neon_wal_segment_close(state); + if (state->wp_conn) + libpqwp_disconnect(state->wp_conn); + pfree(state); +} + +/* + * Like vanilla WALRead, but if requested position is before available_lsn or + * WAL segment doesn't exist on disk, it tries to fetch needed 
segment from the + * advanced safekeeper. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error + * occurs, in which case NeonWALReaderErrMsg returns the description. Error always closes remote + * connection, if there was any, so socket subscription should be removed. + * + * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with + * NeonWALReaderSocket and call NeonWALRead again with exactly the same + * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq + * docs during connection establishment (before first successful read) socket + * underneath might change. + * + * Also, eventually walreader should switch from remote to local read; caller + * should remove subscription to socket then by checking NeonWALReaderEvents + * after successful read (otherwise next read might reopen the connection with + * different socket). + * + * Reading not monotonically is not supported and will result in error. + * + * Caller should be sure that WAL up to requested LSN exists, otherwise + * NEON_WALREAD_WOULDBLOCK might be always returned. + */ +NeonWALReadResult +NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + /* + * If requested data is before known available basebackup lsn or there is + * already active remote state, do remote read.
+ */ + if (startptr < state->available_lsn || state->rem_state != RS_NONE) + { + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + if (NeonWALReadLocal(state, buf, startptr, count, tli)) + { + return NEON_WALREAD_SUCCESS; + } + else if (state->wre_errno == ENOENT) + { + nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr)); + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + else + { + return NEON_WALREAD_ERROR; + } +} + +/* Do the read from remote safekeeper. */ +static NeonWALReadResult +NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + if (state->rem_state == RS_NONE) + { + XLogRecPtr donor_lsn; + + /* no connection yet; start one */ + Safekeeper *donor = GetDonor(state->wp, &donor_lsn); + + if (donor == NULL) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to establish remote connection to fetch WAL: no donor available"); + return NEON_WALREAD_ERROR; + } + snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); + nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", + state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); + state->wp_conn = libpqwp_connect_start(donor->conninfo); + if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: immediately failed with %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + /* we'll poll immediately */ + state->rem_state = RS_CONNECTING_READ; + } + + if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) + { + switch (PQconnectPoll(state->wp_conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: poll error: %s", + 
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + case PGRES_POLLING_READING: + state->rem_state = RS_CONNECTING_READ; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_WRITING: + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_OK: + { + /* connection successfully established */ + char start_repl_query[128]; + + snprintf(start_repl_query, sizeof(start_repl_query), + "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", + LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", + state->donor_name, start_repl_query); + if (!libpqwp_send_query(state->wp_conn, start_repl_query)) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to send %s query to %s: %s", + start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + state->rem_state = RS_WAIT_EXEC_RESULT; + break; + } + + default: /* there is unused PGRES_POLLING_ACTIVE */ + Assert(false); + return NEON_WALREAD_ERROR; /* keep the compiler quiet */ + } + } + + if (state->rem_state == RS_WAIT_EXEC_RESULT) + { + switch (libpqwp_get_query_result(state->wp_conn)) + { + case WP_EXEC_SUCCESS_COPYBOTH: + state->rem_state = RS_ESTABLISHED; + break; + case WP_EXEC_NEEDS_INPUT: + return NEON_WALREAD_WOULDBLOCK; + case WP_EXEC_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s failed: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + default: /* can't happen */ + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s: unexpected result", + state->donor_name); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + } + + Assert(state->rem_state == 
RS_ESTABLISHED); + + /* + * If we had the request before, verify args are the same and advance the + * result ptr according to the progress; otherwise register the request. + */ + if (state->req_lsn != InvalidXLogRecPtr) + { + if (state->req_lsn != startptr || state->req_len != count) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "args changed during request, was %X/%X %zu, now %X/%X %zu", + LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu", + LSN_FORMAT_ARGS(startptr), + count, + state->req_progress); + buf += state->req_progress; + } + else + { + state->req_lsn = startptr; + state->req_len = count; + state->req_progress = 0; + nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(startptr), + count); + } + + while (true) + { + Size to_copy; + + /* + * If we have no ready data, receive new message. + */ + if (state->wal_rem_len == 0 && + + /* + * check for the sake of 0 length reads; walproposer does these for + * heartbeats, though generally they shouldn't hit remote source. + */ + state->req_len - state->req_progress > 0) + { + NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state); + + if (read_msg_res != NEON_WALREAD_SUCCESS) + return read_msg_res; + } + + if (state->req_lsn + state->req_progress != state->rem_lsn) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(state->req_lsn + state->req_progress), + LSN_FORMAT_ARGS(state->rem_lsn), + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + + /* We can copy min of (available, requested) bytes. 
*/ + to_copy = + Min(state->req_len - state->req_progress, state->wal_rem_len); + memcpy(buf, state->wal_ptr, to_copy); + state->wal_ptr += to_copy; + state->wal_rem_len -= to_copy; + state->rem_lsn += to_copy; + if (state->wal_rem_len == 0) + state->wal_ptr = NULL; /* freed by libpqwalproposer */ + buf += to_copy; + state->req_progress += to_copy; + if (state->req_progress == state->req_len) + { + XLogSegNo next_segno; + XLogSegNo req_segno; + + XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize); + XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize); + + /* + * Request completed. If there is a chance of serving next one + * locally, close the connection. + */ + if (state->req_lsn < state->available_lsn && + state->rem_lsn >= state->available_lsn) + { + nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally", + LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno && + is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli)) + { + nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists", + LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + return NEON_WALREAD_SUCCESS; + } + } +} + +/* + * Read one WAL message from the stream, sets state->wal_ptr in case of success. + * Resets remote state in case of failure. 
+ */ +static NeonWALReadResult +NeonWALReaderReadMsg(NeonWALReader *state) +{ + while (true) /* loop until we get 'w' */ + { + char *copydata_ptr; + int copydata_size; + StringInfoData s; + char msg_type; + int hdrlen; + + Assert(state->rem_state == RS_ESTABLISHED); + Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); + + switch (libpqwp_async_read(state->wp_conn, + &copydata_ptr, + &copydata_size)) + { + case PG_ASYNC_READ_SUCCESS: + break; + case PG_ASYNC_READ_TRY_AGAIN: + return NEON_WALREAD_WOULDBLOCK; + case PG_ASYNC_READ_FAIL: + snprintf(state->err_msg, + sizeof(state->err_msg), + "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len, + state->req_progress, + PQerrorMessage(state->wp_conn->pg_conn)); + goto err; + } + + /* put data on StringInfo to parse */ + s.data = copydata_ptr; + s.len = copydata_size; + s.cursor = 0; + s.maxlen = -1; + + if (copydata_size == 0) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "zero length copydata received"); + goto err; + } + msg_type = pq_getmsgbyte(&s); + switch (msg_type) + { + case 'w': + { + XLogRecPtr start_lsn; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "invalid WAL message received from primary"); + goto err; + } + + start_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ + pq_getmsgint64(&s); /* TimestampTz send_time */ + + state->rem_lsn = start_lsn; + state->wal_rem_len = (Size) (s.len - s.cursor); + state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); + nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", + LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); + + return NEON_WALREAD_SUCCESS; + } + case 'k': + { + XLogRecPtr end_lsn; + bool reply_requested; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg,
sizeof(state->err_msg), + "invalid keepalive message received from primary"); + goto err; + } + + end_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* TimestampTz timestamp; */ + reply_requested = pq_getmsgbyte(&s); + nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", + LSN_FORMAT_ARGS(end_lsn), + reply_requested); + if (end_lsn < state->req_lsn + state->req_len) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", + LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); + goto err; + } + continue; + } + default: + nwr_log(WARNING, "invalid replication message type %d", msg_type); + continue; + } + } +err: + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; +} + +/* reset remote connection and request in progress */ +static void +NeonWALReaderResetRemote(NeonWALReader *state) +{ + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + state->rem_state = RS_NONE; + if (state->wp_conn) + { + libpqwp_disconnect(state->wp_conn); + state->wp_conn = NULL; + } + state->donor_name[0] = '\0'; + state->wal_ptr = NULL; + state->wal_rem_len = 0; + state->rem_lsn = InvalidXLogRecPtr; +} + +/* + * Return socket of connection to remote source. Must be called only when + * connection exists (NeonWALReaderEvents returns non zero). + */ +pgsocket +NeonWALReaderSocket(NeonWALReader *state) +{ + if (!state->wp_conn) + nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); + return PQsocket(state->wp_conn->pg_conn); +} + +/* + * Whether remote connection is established. Once it is, the socket stays + * stable until a successful local read or an error, so the caller can update + * socket events instead of re-adding the socket each time.
+ */ +bool +NeonWALReaderIsRemConnEstablished(NeonWALReader *state) +{ + return state->rem_state == RS_ESTABLISHED; +} + +/* + * Returns events user should wait on connection socket or 0 if remote + * connection is not active. + */ +extern uint32 +NeonWALReaderEvents(NeonWALReader *state) +{ + switch (state->rem_state) + { + case RS_NONE: + return 0; + case RS_CONNECTING_READ: + return WL_SOCKET_READABLE; + case RS_CONNECTING_WRITE: + return WL_SOCKET_WRITEABLE; + case RS_WAIT_EXEC_RESULT: + case RS_ESTABLISHED: + return WL_SOCKET_READABLE; + default: + Assert(false); + return 0; /* make compiler happy */ + } +} + +static bool +NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. + */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + neon_wal_segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + if (!neon_wal_segment_open(state, nextSegNo, &tli)) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = errno; + + XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", + fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); + return false; + } + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. 
*/ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? */ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); + + if (readbytes < 0) + { + state->wre_errno = errno; + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", + fname, startoff, strerror(state->wre_errno)); + } + else + { + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", + fname, startoff); + } + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* + * Copy of vanilla wal_segment_open, but returns false in case of error instead + * of ERROR, with errno set. 
+ * + * XLogReaderRoutine->segment_open callback for local pg_wal files + */ +static bool +neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + nwr_log(DEBUG5, "opening %s", path); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return true; + + return false; +} + +static bool +is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) +{ + struct stat stat_buffer; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, segno, segsize); + return stat(path, &stat_buffer) == 0; +} + +/* copy of vanilla wal_segment_close with NeonWALReader */ +static void +neon_wal_segment_close(NeonWALReader *state) +{ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + /* need to check errno? */ + state->seg.ws_file = -1; + } +} + +char * +NeonWALReaderErrMsg(NeonWALReader *state) +{ + return state->err_msg; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h new file mode 100644 index 0000000000..6be9f149aa --- /dev/null +++ b/pgxn/neon/neon_walreader.h @@ -0,0 +1,30 @@ +#ifndef __NEON_WALREADER_H__ +#define __NEON_WALREADER_H__ + +#include "access/xlogdefs.h" + +/* forward declare so we don't have to expose the struct to the public */ +struct NeonWALReader; +typedef struct NeonWALReader NeonWALReader; + +/* avoid including walproposer.h as it includes us */ +struct WalProposer; +typedef struct WalProposer WalProposer; + +/* NeonWALRead return value */ +typedef enum +{ + NEON_WALREAD_SUCCESS, + NEON_WALREAD_WOULDBLOCK, + NEON_WALREAD_ERROR, +} NeonWALReadResult; + +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern void NeonWALReaderFree(NeonWALReader *state); +extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size 
count, TimeLineID tli); +extern pgsocket NeonWALReaderSocket(NeonWALReader *state); +extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); +extern char *NeonWALReaderErrMsg(NeonWALReader *state); + +#endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index d61f74b5c8..3fcaab0bee 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -13,19 +13,16 @@ #ifndef pageserver_h #define pageserver_h -#include "postgres.h" #include "neon_pgversioncompat.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR -#include "storage/block.h" -#include "storage/smgr.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" +#include "storage/block.h" +#include "storage/smgr.h" #include "utils/memutils.h" -#include "pg_config.h" - typedef enum { /* pagestore_client -> pagestore */ @@ -40,13 +37,13 @@ typedef enum T_NeonGetPageResponse, T_NeonErrorResponse, T_NeonDbSizeResponse, -} NeonMessageTag; +} NeonMessageTag; /* base struct for c-style inheritance */ typedef struct { NeonMessageTag tag; -} NeonMessage; +} NeonMessage; #define messageTag(m) (((const NeonMessage *)(m))->tag) @@ -67,27 +64,27 @@ typedef struct NeonMessageTag tag; bool latest; /* if true, request latest page version */ XLogRecPtr lsn; /* request page version @ this LSN */ -} NeonRequest; +} NeonRequest; typedef struct { NeonRequest req; NRelFileInfo rinfo; ForkNumber forknum; -} NeonExistsRequest; +} NeonExistsRequest; typedef struct { NeonRequest req; NRelFileInfo rinfo; ForkNumber forknum; -} NeonNblocksRequest; +} NeonNblocksRequest; typedef struct { NeonRequest req; Oid dbNode; -} NeonDbSizeRequest; +} NeonDbSizeRequest; typedef struct { @@ -95,31 +92,31 @@ typedef struct NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; -} NeonGetPageRequest; +} NeonGetPageRequest; /* supertype of all the Neon*Response structs below */ typedef struct { 
NeonMessageTag tag; -} NeonResponse; +} NeonResponse; typedef struct { NeonMessageTag tag; bool exists; -} NeonExistsResponse; +} NeonExistsResponse; typedef struct { NeonMessageTag tag; uint32 n_blocks; -} NeonNblocksResponse; +} NeonNblocksResponse; typedef struct { NeonMessageTag tag; char page[FLEXIBLE_ARRAY_MEMBER]; -} NeonGetPageResponse; +} NeonGetPageResponse; #define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ)) @@ -127,18 +124,18 @@ typedef struct { NeonMessageTag tag; int64 db_size; -} NeonDbSizeResponse; +} NeonDbSizeResponse; typedef struct { NeonMessageTag tag; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ -} NeonErrorResponse; +} NeonErrorResponse; -extern StringInfoData nm_pack_request(NeonRequest * msg); -extern NeonResponse * nm_unpack_response(StringInfo s); -extern char *nm_to_string(NeonMessage * msg); +extern StringInfoData nm_pack_request(NeonRequest *msg); +extern NeonResponse *nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage *msg); /* * API @@ -146,23 +143,20 @@ extern char *nm_to_string(NeonMessage * msg); typedef struct { - bool (*send) (NeonRequest * request); + bool (*send) (NeonRequest *request); NeonResponse *(*receive) (void); bool (*flush) (void); -} page_server_api; +} page_server_api; extern void prefetch_on_ps_disconnect(void); -extern page_server_api * page_server; +extern page_server_api *page_server; extern char *page_server_connstring; -extern int flush_every_n_requests; -extern int readahead_buffer_size; -extern bool seqscan_prefetch_enabled; -extern int seqscan_prefetch_distance; +extern int flush_every_n_requests; +extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; -extern bool wal_redo; extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); @@ -194,14 +188,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void 
neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, bool request_latest, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, bool request_latest, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 84b26198a7..8888cd89c6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -47,24 +47,26 @@ #include "access/xact.h" #include "access/xlog.h" +#include "access/xlogdefs.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" -#include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" -#include "pagestore_client.h" -#include "postmaster/interrupt.h" +#include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" -#include "storage/smgr.h" +#include "storage/fsm_internals.h" #include "storage/md.h" -#include "pgstat.h" +#include "storage/smgr.h" + +#include "pagestore_client.h" #if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif @@ -100,21 +102,24 @@ typedef enum UNLOGGED_BUILD_PHASE_1, UNLOGGED_BUILD_PHASE_2, 
UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; +} UnloggedBuildPhase; static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + /* * Prefetch implementation: - * + * * Prefetch is performed locally by each backend. * * There can be up to readahead_buffer_size active IO requests registered at * any time. Requests using smgr_prefetch are sent to the pageserver, but we * don't wait on the response. Requests using smgr_read are either read from * the buffer, or (if that's not possible) we wait on the response to arrive - - * this also will allow us to receive other prefetched pages. + * this also will allow us to receive other prefetched pages. * Each request is immediately written to the output buffer of the pageserver * connection, but may not be flushed if smgr_prefetch is used: pageserver * flushes sent requests on manual flush, or every neon.flush_output_after @@ -138,7 +143,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; /* * State machine: - * + * * not in hash : in hash * : * UNUSED ------> REQUESTED --> RECEIVED @@ -149,30 +154,34 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; * +----------------+------------+ * : */ -typedef enum PrefetchStatus { - PRFS_UNUSED = 0, /* unused slot */ - PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not - * necessarily flushed. - * all fields except response valid */ - PRFS_RECEIVED, /* all fields valid */ - PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */ +typedef enum PrefetchStatus +{ + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to + * PS, but not necessarily flushed. 
all fields + * except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still + * valid */ } PrefetchStatus; -typedef struct PrefetchRequest { - BufferTag buftag; /* must be first entry in the struct */ +typedef struct PrefetchRequest +{ + BufferTag buftag; /* must be first entry in the struct */ XLogRecPtr effective_request_lsn; XLogRecPtr actual_request_lsn; - NeonResponse *response; /* may be null */ + NeonResponse *response; /* may be null */ PrefetchStatus status; uint64 my_ring_index; } PrefetchRequest; /* prefetch buffer lookup hash table */ -typedef struct PrfHashEntry { +typedef struct PrfHashEntry +{ PrefetchRequest *slot; - uint32 status; - uint32 hash; + uint32 status; + uint32 hash; } PrfHashEntry; #define SH_PREFIX prfh @@ -196,39 +205,45 @@ typedef struct PrfHashEntry { /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. * It maintains a (ring) buffer of in-flight requests and responses. - * + * * We maintain several indexes into the ring buffer: * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 - * + * * ring_unused points to the first unused slot of the buffer * ring_receive is the next request that is to be received * ring_last is the oldest received entry in the buffer - * + * * Apart from being an entry in the ring buffer of prefetch requests, each * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. 
*/ -typedef struct PrefetchState { - MemoryContext bufctx; /* context for prf_buffer[].response allocations */ - MemoryContext errctx; /* context for prf_buffer[].response allocations */ - MemoryContext hashctx; /* context for prf_buffer */ +typedef struct PrefetchState +{ + MemoryContext bufctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext errctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext hashctx; /* context for prf_buffer */ /* buffer indexes */ - uint64 ring_unused; /* first unused slot */ - uint64 ring_flush; /* next request to flush */ - uint64 ring_receive; /* next slot that is to receive a response */ - uint64 ring_last; /* min slot with a response value */ + uint64 ring_unused; /* first unused slot */ + uint64 ring_flush; /* next request to flush */ + uint64 ring_receive; /* next slot that is to receive a response */ + uint64 ring_last; /* min slot with a response value */ /* metrics / statistics */ - int n_responses_buffered; /* count of PS responses not yet in buffers */ - int n_requests_inflight; /* count of PS requests considered in flight */ - int n_unused; /* count of buffers < unused, > last, that are also unused */ + int n_responses_buffered; /* count of PS responses not yet in + * buffers */ + int n_requests_inflight; /* count of PS requests considered in + * flight */ + int n_unused; /* count of buffers < unused, > last, that are + * also unused */ /* the buffers */ - prfh_hash *prf_hash; - PrefetchRequest prf_buffer[]; /* prefetch buffers */ + prfh_hash *prf_hash; + PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -PrefetchState *MyPState; +static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ ( \ @@ -246,7 +261,7 @@ PrefetchState *MyPState; ) \ ) -XLogRecPtr prefetch_lsn = 0; +static XLogRecPtr prefetch_lsn = 0; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); @@ -263,10 +278,10 @@ static XLogRecPtr 
neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, static bool compact_prefetch_buffers(void) { - uint64 empty_ring_index = MyPState->ring_last; - uint64 search_ring_index = MyPState->ring_receive; - int n_moved = 0; - + uint64 empty_ring_index = MyPState->ring_last; + uint64 search_ring_index = MyPState->ring_receive; + int n_moved = 0; + if (MyPState->ring_receive == MyPState->ring_last) return false; @@ -281,15 +296,14 @@ compact_prefetch_buffers(void) } /* - * Here we have established: - * slots < search_ring_index have an unknown state (not scanned) - * slots >= search_ring_index and <= empty_ring_index are unused - * slots > empty_ring_index are in use, or outside our buffer's range. - * ... unless search_ring_index <= ring_last - * + * Here we have established: slots < search_ring_index have an unknown + * state (not scanned) slots >= search_ring_index and <= empty_ring_index + * are unused slots > empty_ring_index are in use, or outside our buffer's + * range. ... unless search_ring_index <= ring_last + * * Therefore, there is a gap of at least one unused items between - * search_ring_index and empty_ring_index (both inclusive), which grows as we hit - * more unused items while moving backwards through the array. + * search_ring_index and empty_ring_index (both inclusive), which grows as + * we hit more unused items while moving backwards through the array. */ while (search_ring_index > MyPState->ring_last) @@ -329,7 +343,10 @@ compact_prefetch_buffers(void) /* empty the moved slot */ source_slot->status = PRFS_UNUSED; - source_slot->buftag = (BufferTag) {0}; + source_slot->buftag = (BufferTag) + { + 0 + }; source_slot->response = NULL; source_slot->my_ring_index = 0; source_slot->effective_request_lsn = 0; @@ -339,8 +356,8 @@ compact_prefetch_buffers(void) } /* - * Only when we've moved slots we can expect trailing unused slots, - * so only then we clean up trailing unused slots. 
+ * Only when we've moved slots we can expect trailing unused slots, so + * only then we clean up trailing unused slots. */ if (n_moved > 0) { @@ -357,10 +374,9 @@ readahead_buffer_resize(int newsize, void *extra) uint64 end, nfree = newsize; PrefetchState *newPState; - Size newprfs_size = offsetof(PrefetchState, prf_buffer) + ( - sizeof(PrefetchRequest) * newsize - ); - + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + + (sizeof(PrefetchRequest) * newsize); + /* don't try to re-initialize if we haven't initialized yet */ if (MyPState == NULL) return; @@ -387,12 +403,12 @@ readahead_buffer_resize(int newsize, void *extra) newPState->ring_receive = newsize; newPState->ring_flush = newsize; - /* + /* * Copy over the prefetches. - * + * * We populate the prefetch array from the end; to retain the most recent - * prefetches, but this has the benefit of only needing to do one iteration - * on the dataset, and trivial compaction. + * prefetches, but this has the benefit of only needing to do one + * iteration on the dataset, and trivial compaction. */ for (end = MyPState->ring_unused - 1; end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; @@ -400,7 +416,7 @@ readahead_buffer_resize(int newsize, void *extra) { PrefetchRequest *slot = GetPrfSlot(end); PrefetchRequest *newslot; - bool found; + bool found; if (slot->status == PRFS_UNUSED) continue; @@ -463,10 +479,11 @@ consume_prefetch_responses(void) static void prefetch_cleanup_trailing_unused(void) { - uint64 ring_index; + uint64 ring_index; PrefetchRequest *slot; - while (MyPState->ring_last < MyPState->ring_receive) { + while (MyPState->ring_last < MyPState->ring_receive) + { ring_index = MyPState->ring_last; slot = GetPrfSlot(ring_index); @@ -480,7 +497,7 @@ prefetch_cleanup_trailing_unused(void) /* * Wait for slot of ring_index to have received its response. * The caller is responsible for making sure the request buffer is flushed. 
- * + * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. */ @@ -512,7 +529,7 @@ prefetch_wait_for(uint64 ring_index) /* * Read the response of a prefetch request into its slot. - * + * * The caller is responsible for making sure that the request for this buffer * was flushed to the PageServer. * @@ -552,7 +569,7 @@ prefetch_read(PrefetchRequest *slot) /* * Disconnect hook - drop prefetches when the connection drops - * + * * If we don't remove the failed prefetches, we'd be serving incorrect * data to the smgr. */ @@ -563,7 +580,7 @@ prefetch_on_ps_disconnect(void) while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; - uint64 ring_index = MyPState->ring_receive; + uint64 ring_index = MyPState->ring_receive; slot = GetPrfSlot(ring_index); @@ -593,7 +610,7 @@ prefetch_set_unused(uint64 ring_index) PrefetchRequest *slot = GetPrfSlot(ring_index); if (ring_index < MyPState->ring_last) - return; /* Should already be unused */ + return; /* Should already be unused */ Assert(MyPState->ring_unused > ring_index); @@ -624,7 +641,11 @@ prefetch_set_unused(uint64 ring_index) /* run cleanup if we're holding back ring_last */ if (MyPState->ring_last == ring_index) prefetch_cleanup_trailing_unused(); - /* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */ + + /* + * ... 
and try to store the buffered responses more compactly if > 12.5% + * of the buffer is gaps + */ else if (ReceiveBufferNeedsCompaction()) compact_prefetch_buffers(); } @@ -632,7 +653,7 @@ prefetch_set_unused(uint64 ring_index) static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) { - bool found; + bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, .req.latest = false, @@ -650,21 +671,22 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); + XLogRecPtr lsn = neon_get_request_lsn( + &request.req.latest, + BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum + ); + /* - * Note: effective_request_lsn is potentially higher than the requested - * LSN, but still correct: - * + * Note: effective_request_lsn is potentially higher than the + * requested LSN, but still correct: + * * We know there are no changes between the actual requested LSN and * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which - * then would have had to result in a larger request LSN for this page. - * + * have been in cache and evicted between those LSN values, which then + * would have had to result in a larger request LSN for this page. + * * It is possible that a concurrent backend loads the page, modifies * it and then evicts it again, but the LSN of that eviction cannot be * smaller than the current WAL insert/redo pointer, which is already @@ -701,7 +723,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * prefetch_register_buffer() - register and prefetch buffer * * Register that we may want the contents of BufferTag in the near future. 
- * + * * If force_latest and force_lsn are not NULL, those values are sent to the * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure * to fill in these values manually. @@ -713,14 +735,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) { - uint64 ring_index; + uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; - Retry: +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) @@ -740,7 +762,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls */ if (force_latest && force_lsn) { - /* if we want the latest version, any effective_request_lsn < request lsn is OK */ + /* + * if we want the latest version, any effective_request_lsn < + * request lsn is OK + */ if (*force_latest) { if (*force_lsn > slot->effective_request_lsn) @@ -751,7 +776,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls } } - /* if we don't want the latest version, only accept requests with the exact same LSN */ + + /* + * if we don't want the latest version, only accept requests with + * the exact same LSN + */ else { if (*force_lsn != slot->effective_request_lsn) @@ -798,7 +827,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls */ if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) { - uint64 cleanup_index = MyPState->ring_last; + uint64 cleanup_index = MyPState->ring_last; + slot = GetPrfSlot(cleanup_index); Assert(slot->status != PRFS_UNUSED); @@ -813,7 +843,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls } else { - /* We have the slot for ring_last, so that must still be in progress */ + /* + * We have the 
slot for ring_last, so that must still be in + * progress + */ switch (slot->status) { case PRFS_REQUESTED: @@ -832,8 +865,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls } /* - * The next buffer pointed to by `ring_unused` is now definitely empty, - * so we can insert the new request to it. + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. */ ring_index = MyPState->ring_unused; slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; @@ -859,7 +892,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls { if (!page_server->flush()) { - /* Prefetch set is reset in case of error, so we should try to register our request once again */ + /* + * Prefetch set is reset in case of error, so we should try to + * register our request once again + */ goto Retry; } MyPState->ring_flush = MyPState->ring_unused; @@ -871,8 +907,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls static NeonResponse * page_server_request(void const *req) { - NeonResponse* resp; - do { + NeonResponse *resp; + + do + { while (!page_server->send((NeonRequest *) req) || !page_server->flush()); MyPState->ring_flush = MyPState->ring_unused; consume_prefetch_responses(); @@ -884,7 +922,7 @@ page_server_request(void const *req) StringInfoData -nm_pack_request(NeonRequest * msg) +nm_pack_request(NeonRequest *msg) { StringInfoData s; @@ -1000,7 +1038,7 @@ nm_unpack_response(StringInfo s) /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); - + Assert(msg_resp->tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; @@ -1056,7 +1094,7 @@ nm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -nm_to_string(NeonMessage * msg) +nm_to_string(NeonMessage *msg) { StringInfoData s; @@ -1185,7 +1223,7 @@ nm_to_string(NeonMessage * 
msg) * directly because it skips the logging if the LSN is new enough. */ static XLogRecPtr -log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno, +log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { PGAlignedBlock copied_buffer; @@ -1208,11 +1246,10 @@ PageIsEmptyHeapPage(char *buffer) } static void -neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, #if PG_MAJORVERSION_NUM < 16 - char *buffer, bool force) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) #else - const char *buffer, bool force) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); @@ -1312,24 +1349,23 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void neon_init(void) { - Size prfs_size; + Size prfs_size; if (MyPState != NULL) return; - prfs_size = offsetof(PrefetchState, prf_buffer) + ( - sizeof(PrefetchRequest) * readahead_buffer_size - ); + prfs_size = offsetof(PrefetchState, prf_buffer) + + sizeof(PrefetchRequest) * readahead_buffer_size; MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); - + MyPState->n_unused = readahead_buffer_size; MyPState->bufctx = SlabContextCreate(TopMemoryContext, "NeonSMGR/prefetch", SLAB_DEFAULT_BLOCK_SIZE * 17, PS_GETPAGERESPONSE_SIZE); - MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, "NeonSMGR/errors", ALLOCSET_DEFAULT_SIZES); MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, @@ -1339,6 +1375,9 @@ neon_init(void) MyPState->prf_hash = prfh_create(MyPState->hashctx, readahead_buffer_size, NULL); + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; + #ifdef DEBUG_COMPARE_LOCAL mdinit(); #endif @@ -1569,14 +1608,14 
@@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) /* * Newly created relation is empty, remember that in the relsize cache. * - * Note that in REDO, this is called to make sure the relation fork exists, - * but it does not truncate the relation. So, we can only update the - * relsize if it didn't exist before. - * + * Note that in REDO, this is called to make sure the relation fork + * exists, but it does not truncate the relation. So, we can only update + * the relsize if it didn't exist before. + * * Also, in redo, we must make sure to update the cached size of the - * relation, as that is the primary source of truth for REDO's - * file length considerations, and as file extension isn't (perfectly) - * logged, we need to take care of that before we hit file size checks. + * relation, as that is the primary source of truth for REDO's file length + * considerations, and as file extension isn't (perfectly) logged, we need + * to take care of that before we hit file size checks. * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created @@ -1652,7 +1691,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, #endif { XLogRecPtr lsn; - BlockNumber n_blocks = 0; + BlockNumber n_blocks = 0; switch (reln->smgr_relpersistence) { @@ -1693,9 +1732,10 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* - * Usually Postgres doesn't extend relation on more than one page - * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData - * call smgrextend for destination relation n using size of source relation + * Usually Postgres doesn't extend relation on more than one page (leaving + * holes). 
But this rule is violated in PG-15 where + * CreateAndCopyRelationData call smgrextend for destination relation n + * using size of source relation */ n_blocks = neon_nblocks(reln, forkNum); while (n_blocks < blkno) @@ -1716,11 +1756,13 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif + /* - * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr. - * An smgr_write() call will come for the buffer later, after it has been initialized - * with the real page contents, and it is eventually evicted from the buffer cache. - * But we need a valid LSN to the relation metadata update now. + * smgr_extend is often called with an all-zeroes page, so + * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer + * later, after it has been initialized with the real page contents, and + * it is eventually evicted from the buffer cache. But we need a valid LSN + * to the relation metadata update now. */ if (lsn == InvalidXLogRecPtr) { @@ -1779,9 +1821,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rlocator, forkNum), - InvalidBlockNumber))); + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forkNum), + InvalidBlockNumber))); /* Don't log any pages if we're not allowed to do so. 
*/ if (!XLogInsertAllowed()) @@ -1863,12 +1905,12 @@ neon_close(SMgrRelation reln, ForkNumber forknum) bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - BufferTag tag; uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; switch (reln->smgr_relpersistence) { - case 0: /* probably shouldn't happen, but ignore it */ + case 0: /* probably shouldn't happen, but ignore it */ case RELPERSISTENCE_PERMANENT: break; @@ -1883,10 +1925,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) return false; - tag = (BufferTag) { - .forkNum = forknum, - .blockNum = blocknum - }; + tag.forkNum = forknum; + tag.blockNum = blocknum; + CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); ring_index = prefetch_register_buffer(tag, NULL, NULL); @@ -1939,23 +1980,21 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ +void #if PG_MAJORVERSION_NUM < 16 -void PGDLLEXPORT neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer) #else -void PGDLLEXPORT neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, void *buffer) #endif { NeonResponse *resp; - BufferTag buftag; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - - buftag = (BufferTag) { + BufferTag buftag = + { .forkNum = forkNum, .blockNum = blkno, }; @@ -1964,12 +2003,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * The redo process does not lock pages that it needs to replay but are - * not in the shared buffers, so a concurrent process may request the - * page after redo has decided it won't redo that page and updated the - * LwLSN for that page. - * If we're in hot standby we need to take care that we don't return - * until after REDO has finished replaying up to that LwLSN, as the page - * should have been locked up to that point. + * not in the shared buffers, so a concurrent process may request the page + * after redo has decided it won't redo that page and updated the LwLSN + * for that page. If we're in hot standby we need to take care that we + * don't return until after REDO has finished replaying up to that LwLSN, + * as the page should have been locked up to that point. * * See also the description on neon_redo_read_buffer_filter below. * @@ -1977,7 +2015,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * concurrent failed read IOs. Those IOs should never have a request_lsn * that is as large as the WAL record we're currently replaying, if it * weren't for the behaviour of the LwLsn cache that uses the highest - * value of the LwLsn cache when the entry is not found. + * value of the LwLsn cache when the entry is not found. 
*/ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) XLogWaitForReplayOf(request_lsn); @@ -1995,12 +2033,14 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large enough, so drop the prefetch */ + else /* the current prefetch LSN is not large + * enough, so drop the prefetch */ { /* * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance is - * large enough and a backend didn't consume all prefetch requests. + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. */ if (slot->status == PRFS_REQUESTED) { @@ -2027,11 +2067,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { /* - * Empty our reference to the prefetch buffer's hash entry. - * When we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, - * in which case we need to retry and take the branch above. + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. */ entry = NULL; } @@ -2079,11 +2119,10 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * neon_read() -- Read the specified block from a relation. 
*/ void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, #if PG_MAJORVERSION_NUM < 16 - char *buffer) +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else - void *buffer) +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { bool latest; @@ -2218,11 +2257,10 @@ hexdump_page(char *page) * use mdextend(). */ void -neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, #if PG_MAJORVERSION_NUM < 16 - char *buffer, bool skipFsync) +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) #else - const void *buffer, bool skipFsync) +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) #endif { XLogRecPtr lsn; @@ -2722,9 +2760,90 @@ smgr_init_neon(void) } +static void +neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr) +{ + BlockNumber relsize; + + /* Extend the relation if we know its size */ + if (get_cached_relsize(rinfo, forknum, &relsize)) + { + if (relsize < blkno + 1) + { + update_cached_relsize(rinfo, forknum, blkno + 1); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); + } + } + else + { + /* + * Size was not cached. We populate the cache now, with the size of + * the relation measured after this WAL record is applied. + * + * This length is later reused when we open the smgr to read the + * block, which is fine and expected. 
+ */ + + NeonResponse *response; + NeonNblocksResponse *nbresponse; + NeonNblocksRequest request = { + .req = (NeonRequest) { + .lsn = end_recptr, + .latest = false, + .tag = T_NeonNblocksRequest, + }, + .rinfo = rinfo, + .forknum = forknum, + }; + + response = page_server_request(&request); + + Assert(response->tag == T_NeonNblocksResponse); + nbresponse = (NeonNblocksResponse *) response; + + relsize = Max(nbresponse->n_blocks, blkno + 1); + + set_cached_relsize(rinfo, forknum, relsize); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); + + elog(SmgrTrace, "Set length to %d", relsize); + } +} + +#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) + +/* + * TODO: Maybe it is better to make the corresponding function from freespace.c public? + */ +static BlockNumber +get_fsm_physical_block(BlockNumber heapblk) +{ + BlockNumber pages; + int leafno; + int l; + + /* + * Calculate the logical page number of the first leaf page below the + * given page. + */ + leafno = heapblk / SlotsPerFSMPage; + + /* Count upper level nodes required to address the leaf page */ + pages = 0; + for (l = 0; l < FSM_TREE_DEPTH; l++) + { + pages += leafno + 1; + leafno /= SlotsPerFSMPage; + } + + /* Turn the page count into 0-based block number */ + return pages - 1; +} + + /* * Return whether we can skip the redo for this block. - * + * * The conditions for skipping the IO are: * * - The block is not in the shared buffers, and @@ -2757,19 +2876,18 @@ smgr_init_neon(void) * contents, where with REDO locking it would wait on block 1 and see * block 3 with post-REDO contents only. 
*/ -bool +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; NRelFileInfo rinfo; ForkNumber forknum; - BlockNumber blkno; + BlockNumber blkno; BufferTag tag; uint32 hash; LWLock *partitionLock; Buffer buffer; bool no_redo_needed; - BlockNumber relsize; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) return true; @@ -2783,8 +2901,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) /* * Out of an abundance of caution, we always run redo on shared catalogs, - * regardless of whether the block is stored in shared buffers. - * See also this function's top comment. + * regardless of whether the block is stored in shared buffers. See also + * this function's top comment. */ if (!OidIsValid(NInfoGetDbOid(rinfo))) return false; @@ -2810,8 +2928,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) /* In both cases st lwlsn past this WAL record */ SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); - /* we don't have the buffer in memory, update lwLsn past this record, - * also evict page fro file cache + /* + * we don't have the buffer in memory, update lwLsn past this record, also + * evict page from file cache */ if (no_redo_needed) lfc_evict(rinfo, forknum, blkno); @@ -2819,49 +2938,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) LWLockRelease(partitionLock); - /* Extend the relation if we know its size */ - if (get_cached_relsize(rinfo, forknum, &relsize)) + neon_extend_rel_size(rinfo, forknum, blkno, end_recptr); + if (forknum == MAIN_FORKNUM) { - if (relsize < blkno + 1) - { - update_cached_relsize(rinfo, forknum, blkno + 1); - SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); - } + neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr); } - else - { - /* - * Size was not cached. 
We populate the cache now, with the size of the - * relation measured after this WAL record is applied. - * - * This length is later reused when we open the smgr to read the block, - * which is fine and expected. - */ - - NeonResponse *response; - NeonNblocksResponse *nbresponse; - NeonNblocksRequest request = { - .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, - .tag = T_NeonNblocksRequest, - }, - .rinfo = rinfo, - .forknum = forknum, - }; - - response = page_server_request(&request); - - Assert(response->tag == T_NeonNblocksResponse); - nbresponse = (NeonNblocksResponse *) response; - - Assert(nbresponse->n_blocks > blkno); - - set_cached_relsize(rinfo, forknum, nbresponse->n_blocks); - SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); - - elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); - } - return no_redo_needed; } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 10544ba7a8..1f7c473e7d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -35,6 +35,8 @@ * *------------------------------------------------------------------------- */ +#include + #include "postgres.h" #include "libpq/pqformat.h" #include "neon.h" @@ -43,7 +45,6 @@ /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); @@ -76,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); -static char *FormatSafekeeperState(SafekeeperState state); +static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); 
-static uint32 SafekeeperStateDesiredEvents(SafekeeperState state); static char *FormatEvents(WalProposer *wp, uint32 events); + WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { @@ -98,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) port = strchr(host, ':'); if (port == NULL) { - walprop_log(FATAL, "port is not specified"); + wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); @@ -106,11 +107,12 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - walprop_log(FATAL, "Too many safekeepers"); + wp_log(FATAL, "too many safekeepers"); } wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; { @@ -121,19 +123,17 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) - walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); - wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]); - wp->safekeeper[wp->n_safekeepers].flushWrite = false; wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; } if (wp->n_safekeepers < 1) { - walprop_log(FATAL, "Safekeepers addresses are not specified"); + wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; @@ 
-144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); wp->greetRequest.systemId = wp->config->systemId; if (!wp->config->neon_timeline) - walprop_log(FATAL, "neon.timeline_id is not provided"); + wp_log(FATAL, "neon.timeline_id is not provided"); if (*wp->config->neon_timeline != '\0' && !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); if (!wp->config->neon_tenant) - walprop_log(FATAL, "neon.tenant_id is not provided"); + wp_log(FATAL, "neon.tenant_id is not provided"); if (*wp->config->neon_tenant != '\0' && !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); + wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); wp->greetRequest.timeline = wp->config->pgTimeline; wp->greetRequest.walSegSize = wp->config->wal_segment_size; @@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp) if (wp->propTermHistory.entries != NULL) pfree(wp->propTermHistory.entries); wp->propTermHistory.entries = NULL; - + pfree(wp); } @@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { - walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout); + wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", + sk->host, sk->port, 
FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -303,58 +303,20 @@ WalProposerLoop(WalProposer *wp) WalProposerPoll(wp); } -/* - * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - WalProposer *wp = to_remove->wp; - - /* Remove the existing event set, assign sk->eventPos = -1 */ - wp->api.free_event_set(wp); - /* Re-initialize it without adding any safekeeper events */ - wp->api.init_event_set(wp); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < wp->n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &wp->safekeeper[i]; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->state != SS_OFFLINE) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - /* will set sk->eventPos */ - wp->api.add_safekeeper_event_set(sk, desired_events); - } - } -} /* Shuts down and cleans up the connection for a safekeeper. 
Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { - sk->wp->api.conn_finish(sk); sk->state = SS_OFFLINE; - sk->flushWrite = false; sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; - HackyRemoveWalProposerEvent(sk); + sk->wp->api.conn_finish(sk); + sk->wp->api.rm_safekeeper_event_set(sk); } /* @@ -394,8 +356,8 @@ ResetConnection(Safekeeper *sk) * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ - walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the @@ -418,7 +380,7 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port); + wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); @@ -472,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { +#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */ WalProposer *wp = sk->wp; +#endif /* * Sanity check. 
We assume further down that the operations don't block @@ -488,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * ResetConnection */ case SS_OFFLINE: - walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -524,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * requests. */ case SS_VOTING: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -553,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * Idle state for waiting votes from quorum. */ case SS_IDLE: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -579,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk) switch (result) { case WP_CONN_POLLING_OK: - walprop_log(LOG, "connected with node %s:%s", sk->host, - sk->port); + wp_log(LOG, "connected with node %s:%s", sk->host, + sk->port); sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* @@ -603,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - walprop_log(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * If connecting failed, we don't want to restart the connection @@ -620,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk) * Because PQconnectPoll can change the socket, we have to un-register the * old 
event and re-register an event on the new socket. */ - HackyRemoveWalProposerEvent(sk); + wp->api.rm_safekeeper_event_set(sk); wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ @@ -640,8 +604,8 @@ SendStartWALPush(Safekeeper *sk) if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) { - walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -677,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk) break; case WP_EXEC_FAILED: - walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; @@ -688,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk) * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + wp_log(WARNING, "received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -724,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -744,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (wp->n_connected == wp->quorum) { wp->propTerm++; - walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); + wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); wp->voteRequest = (VoteRequest) { @@ -757,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk) else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. */ - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, wp->propTerm); } /* @@ -799,7 +763,7 @@ SendVoteRequest(Safekeeper *sk) WalProposer *wp = sk->wp; /* We have quorum for voting, send our vote request */ - walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); + wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; @@ -816,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; - walprop_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - 
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + wp_log(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -831,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk) if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, wp->propTerm); } Assert(sk->voteResponse.term == wp->propTerm); @@ -845,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk) } else if (wp->n_votes > wp->quorum) { - /* recovery already performed, just start streaming */ + /* already elected, start streaming */ SendProposerElected(sk); } else @@ -871,21 +835,16 @@ HandleElectedProposer(WalProposer *wp) DetermineEpochStartLsn(wp); /* - * Check if not all safekeepers are up-to-date, we need to download WAL - * needed to synchronize them + * Synchronously download WAL from the most advanced safekeeper. We do + * that only for logical replication (and switching logical walsenders to + * neon_walreader is a todo.) 
*/ - if (wp->truncateLsn < wp->propEpochStartLsn) + if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(wp->truncateLsn), - LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - /* Perform recovery */ - if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) - walprop_log(FATAL, "Failed to recover state"); + wp_log(FATAL, "failed to download WAL for logical replicaiton"); } - else if (wp->config->syncSafekeepers) + + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); @@ -989,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->timelineStartLsn != InvalidXLogRecPtr && wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { - walprop_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); + wp_log(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(wp->timelineStartLsn), + LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } @@ -1010,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp) { wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); } - walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -1037,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 
1].lsn = wp->propEpochStartLsn; - walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, - wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, - LSN_FORMAT_ARGS(wp->truncateLsn)); + wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1069,21 +1028,20 @@ DetermineEpochStartLsn(WalProposer *wp) if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) { - walprop_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); + /* + * Panic to restart PG as we need to retake basebackup. + * However, don't dump core as this is kinda expected + * scenario. + */ + disable_core_dump(); + wp_log(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } walprop_shared->mineLastElectedTerm = wp->propTerm; } - - /* - * WalProposer has just elected itself and initialized history, so - * we can call election callback. Usually it updates truncateLsn to - * fetch WAL for logical replication. 
- */ - wp->api.after_election(wp); } /* @@ -1104,6 +1062,9 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; + /* Now that we are ready to send it's a good moment to create WAL reader */ + wp->api.wal_reader_allocate(sk); + /* * Determine start LSN by comparing safekeeper's log term switch history * and proposer's, searching for the divergence point. @@ -1130,34 +1091,10 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < wp->truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty - * safekeeper with no history. In combination with a gap, that can - * happen when we introduce a new safekeeper to the cluster. This - * is a rare case, which is triggered manually for now, and should - * be treated with care. - */ - - /* - * truncateLsn will not change without ack from current - * safekeeper, and it's aligned to the WAL record, so we can - * safely start streaming from this point. 
- */ - sk->startStreamingAt = wp->truncateLsn; - - walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else { @@ -1180,7 +1117,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); + Assert(sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; msg.term = wp->propTerm; @@ -1189,9 +1126,9 @@ SendProposerElected(Safekeeper *sk) msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = i >= 0 ? 
wp->propTermHistory.entries[i].term : 0; - walprop_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + wp_log(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); @@ -1223,6 +1160,7 @@ StartStreaming(Safekeeper *sk) * once for a connection. */ sk->state = SS_ACTIVE; + sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ @@ -1281,9 +1219,13 @@ HandleActiveState(Safekeeper *sk, uint32 events) { WalProposer *wp = sk->wp; - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) + /* + * Note: we don't known which socket awoke us (sk or nwr). However, as + * SendAppendRequests always tries to send at least one msg in + * SS_ACTIVE_SEND be careful not to go there if are only after sk + * response, otherwise it'd create busy loop of pings. + */ + if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL) if (!SendAppendRequests(sk)) return; @@ -1291,28 +1233,29 @@ HandleActiveState(Safekeeper *sk, uint32 events) if (!RecvAppendResponses(sk)) return; - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. 
This check - * isn't necessary now, because we always send append messages immediately - * after arrival. But it's good to have it here in case we change this - * behavior in the future. - */ - if (sk->streamingAt != wp->availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* expected never to happen, c.f. walprop_pg_active_state_update_event_set */ + if (events & WL_SOCKET_CLOSED) + { + wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +#endif - wp->api.update_event_set(sk, newEvents); + /* configures event set for yield whatever is the substate */ + wp->api.active_state_update_event_set(sk); } /* * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. + * socket or neon_walreader blocks, whichever comes first; active_state is + * updated accordingly. Caller should take care of updating event set. Even if + * no unsent WAL is available, at least one empty message will be sent as a + * heartbeat, if socket is ready. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connections if any error on them is encountered. * Returns false in this case, true otherwise. 
*/ static bool @@ -1320,11 +1263,11 @@ SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; - AppendRequestHeader *req; PGAsyncWriteResult writeResult; bool sentAnything = false; + AppendRequestHeader *req; - if (sk->flushWrite) + if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) @@ -1335,76 +1278,101 @@ SendAppendRequests(Safekeeper *sk) return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; + sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > wp->availableLsn) + if (sk->active_state == SS_ACTIVE_SEND) { - endLsn = wp->availableLsn; + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > wp->availableLsn) + { + endLsn = wp->availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); + + wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + sk->active_state = SS_ACTIVE_READ_WAL; } - req = &sk->appendRequest; - PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - - walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - 
LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - /* wal_read will raise error on failure */ - wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn); - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) + if (sk->active_state == SS_ACTIVE_READ_WAL) { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; + char *errmsg; - case PG_ASYNC_WRITE_TRY_FLUSH: + req = &sk->appendRequest; - /* - * * We still need to call PQflush some more to finish the - * job. Caller function will handle this by setting right - * event* set. 
- */ - sk->flushWrite = true; - return true; + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } - case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = req->endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + sk->active_state = SS_ACTIVE_SEND; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event set. + */ + sk->active_state = SS_ACTIVE_FLUSH; + return true; + + case PG_ASYNC_WRITE_FAIL: + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } } } @@ -1414,7 +1382,7 @@ SendAppendRequests(Safekeeper *sk) /* * Receive and process all available feedback. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. 
@@ -1437,18 +1405,23 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port); + wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port); if (sk->appendResponse.term > wp->propTerm) { - /* Another compute with higher term is running. */ - walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, wp->propTerm); + /* + * Another compute with higher term is running. Panic to restart + * PG as we likely need to retake basebackup. However, don't dump + * core as this is kinda expected scenario. 
+ */ + disable_core_dump(); + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1492,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->last_received_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + wp_log(DEBUG2, 
"ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { @@ -1529,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1544,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese * Skip unknown keys to support backward compatibile protocol * changes */ - walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1595,39 +1568,77 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) return responses[wp->n_safekeepers - wp->quorum]; } +/* + * Return safekeeper with active connection from which WAL can be downloaded, or + * none if it doesn't exist. donor_lsn is set to end position of the donor to + * the best of our knowledge. + */ +Safekeeper * +GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +{ + *donor_lsn = InvalidXLogRecPtr; + Safekeeper *donor = NULL; + int i; + + if (wp->n_votes < wp->quorum) + { + wp_log(WARNING, "GetDonor called before elections are won"); + return NULL; + } + + /* + * First, consider node which had determined our term start LSN as we know + * about its position immediately after election before any feedbacks are + * sent. 
+ */ + if (wp->safekeeper[wp->donor].state >= SS_IDLE) + { + donor = &wp->safekeeper[wp->donor]; + *donor_lsn = wp->propEpochStartLsn; + } + + /* + * But also check feedbacks from all nodes with live connections and take + * the highest one. Note: if node sends feedbacks it already processed + * elected message so its term is fine. + */ + for (i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + { + donor = sk; + *donor_lsn = sk->appendResponse.flushLsn; + } + } + return donor; +} + static void HandleSafekeeperResponse(WalProposer *wp) { XLogRecPtr minQuorumLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr candidateTruncateLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. + * Try to advance truncateLsn -- the last record flushed to all + * safekeepers. * - * Advanced truncateLsn should be not further than nearest commitLsn. This - * prevents surprising violation of truncateLsn <= commitLsn invariant - * which might occur because 1) truncateLsn can be advanced immediately - * once chunk is broadcast to all safekeepers, and commitLsn generally - * can't be advanced based on feedback from safekeeper who is still in the - * previous epoch (similar to 'leader can't commit entries from previous - * term' in Raft); 2) chunks we read from WAL and send are plain sheets of - * bytes, but safekeepers ack only on record boundaries. + * Advanced truncateLsn should be not higher than commitLsn. 
This prevents + * surprising violation of truncateLsn <= commitLsn invariant which might + * occur because commitLsn generally can't be advanced based on feedback + * from safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft). */ - minFlushLsn = CalculateMinFlushLsn(wp); - if (minFlushLsn > wp->truncateLsn) + candidateTruncateLsn = CalculateMinFlushLsn(wp); + candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + if (candidateTruncateLsn > wp->truncateLsn) { - wp->truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note that - * slot doesn't exist if we are in syncSafekeepers mode. - */ - wp->api.confirm_wal_streamed(wp, wp->truncateLsn); + wp->truncateLsn = candidateTruncateLsn; } /* @@ -1699,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; case PG_ASYNC_READ_FAIL: - walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1739,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1811,13 +1822,14 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; - uint32 events; + uint32 sk_events; + uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, 
msg, msg_size)) { - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1828,9 +1840,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - wp->api.update_event_set(sk, events); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * nwr_events is relevant only during SS_ACTIVE which doesn't use + * BlockingWrite + */ + Assert(!nwr_events); + if (sk_events) + wp->api.update_event_set(sk, sk_events); return true; } @@ -1862,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1901,9 +1919,9 @@ AsyncFlush(Safekeeper *sk) /* Nothing to do; try again when the socket's ready */ return false; case -1: - walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ResetConnection(sk); return 
false; default: @@ -1932,18 +1950,18 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * -FormatSafekeeperState(SafekeeperState state) +FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; - switch (state) + switch (sk->state) { case SS_OFFLINE: return_val = "offline"; @@ -1971,7 +1989,18 @@ FormatSafekeeperState(SafekeeperState state) return_val = "idle"; break; case SS_ACTIVE: - return_val = "active"; + switch (sk->active_state) + { + case SS_ACTIVE_SEND: + return_val = "active send"; + break; + case SS_ACTIVE_READ_WAL: + return_val = "active read WAL"; + break; + case SS_ACTIVE_FLUSH: + return_val = "active flush"; + break; + } break; } @@ -1984,22 +2013,21 @@ FormatSafekeeperState(SafekeeperState state) static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { - WalProposer *wp = sk->wp; - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. 
`events & expected != 0`) - */ + uint32 sk_events; + uint32 nwr_events; + uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ + WalProposer *wp = sk->wp; - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * Without one more level of notify target indirection we have no way to + * distinguish which socket woke us up, so just union expected events. + */ + expected = sk_events | nwr_events; + events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { @@ -2007,37 +2035,40 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail. */ - walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } -/* Returns the set of events a safekeeper in this state should be waiting on +/* Returns the set of events for both safekeeper (sk_events) and neon_walreader + * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. 
*/ -static uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) +void +SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { - uint32 result = WL_NO_EVENTS; + WalProposer *wp = sk->wp; + + *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ - switch (state) + switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; + *sk_events = WL_SOCKET_WRITEABLE; + return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; /* * Idle states use read-readiness as a sign that the connection @@ -2045,32 +2076,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state) */ case SS_VOTING: case SS_IDLE: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ case SS_SEND_ELECTED_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + return; + case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; + switch (sk->active_state) + { + /* + * Everything is sent; we just wait for sk responses and + * latch. + * + * Note: this assumes we send all available WAL to + * safekeeper in one wakeup (unless it blocks). Otherwise + * we would want WL_SOCKET_WRITEABLE here to finish the + * work. + */ + case SS_ACTIVE_SEND: + *sk_events = WL_SOCKET_READABLE; + /* c.f. 
walprop_pg_active_state_update_event_set */ +#if PG_VERSION_NUM >= 150000 + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + + /* + * Waiting for neon_walreader socket, but we still read + * responses from sk socket. + */ + case SS_ACTIVE_READ_WAL: + *sk_events = WL_SOCKET_READABLE; + *nwr_events = wp->api.wal_reader_events(sk); + return; + + /* + * Need to flush the sk socket, so ignore neon_walreader + * one and set write interest on sk. + */ + case SS_ACTIVE_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* c.f. walprop_pg_active_state_update_event_set */ + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + } + return; /* The offline state expects no events. */ case SS_OFFLINE: - result = WL_NO_EVENTS; - break; + *sk_events = 0; + return; default: Assert(false); - break; } - - return result; } /* Returns a human-readable string corresponding to the event set @@ -2110,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events) if (events & (~all_flags)) { - walprop_log(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + wp_log(WARNING, "event formatting found unexpected component %d", + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 664aeedfa7..688d8e6e52 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -1,14 +1,15 @@ #ifndef __NEON_WALPROPOSER_H__ #define __NEON_WALPROPOSER_H__ -#include "postgres.h" -#include "access/xlogdefs.h" -#include "port.h" -#include "access/xlog_internal.h" #include "access/transam.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" #include "nodes/replnodes.h" -#include "utils/uuid.h" #include "replication/walreceiver.h" +#include "utils/uuid.h" + +#include "libpqwalproposer.h" 
+#include "neon_walreader.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -22,43 +23,9 @@ */ #define WL_NO_EVENTS 0 -struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ +struct WalProposerConn; /* Defined in libpqwalproposer.h */ typedef struct WalProposerConn WalProposerConn; -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - - /* - * The read is ongoing. Wait until the connection is read-ready, then try - * again. - */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - - /* - * The write started, but you'll need to call PQflush some more times to - * finish it off. We just tried, so it's best to wait until the connection - * is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - /* * WAL safekeeper state, which is used to wait for some event. * @@ -135,6 +102,40 @@ typedef enum SS_ACTIVE, } SafekeeperState; +/* + * Sending WAL substates of SS_ACTIVE. + */ +typedef enum +{ + /* + * We are ready to send more WAL, waiting for latch set to learn about + * more WAL becoming available (or just a timeout to send heartbeat). + */ + SS_ACTIVE_SEND, + + /* + * Polling neon_walreader to receive chunk of WAL (probably remotely) to + * send to this safekeeper. + * + * Note: socket management is done completely inside walproposer_pg for + * simplicity, and thus simulation doesn't test it. Which is fine as + * simulation is mainly aimed at consensus checks, not waiteventset + * management. 
+ * + * Also, while in this state we don't touch safekeeper socket, so in + * theory it might close connection as inactive. This can be addressed if + * needed; however, while fetching WAL we should regularly send it, so the + * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle + * walreader socket), but similarly shouldn't be a problem. + */ + SS_ACTIVE_READ_WAL, + + /* + * Waiting for write readiness to flush the socket. + */ + SS_ACTIVE_FLUSH, +} SafekeeperActiveState; + /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -343,12 +344,11 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush,* - * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ + SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ @@ -356,7 +356,8 @@ typedef struct Safekeeper /* postgres-specific fields */ - #ifndef WALPROPOSER_LIB +#ifndef WALPROPOSER_LIB + /* * postgres protocol connection to the WAL acceptor * @@ -368,23 +369,39 @@ typedef struct Safekeeper /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState *xlogreader; + NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; - #endif + + /* + * Neon WAL reader position in wait event set, or -1 if no socket. Note + * that event must be removed not only on error/failure, but also on + * successful *local* read, as next read might again be remote, but with + * different socket. + */ + int nwrEventPos; + + /* + * Per libpq docs, during connection establishment socket might change, + * remember here if it is stable to avoid readding to the event set if + * possible. 
Must be reset whenever nwr event is deleted. + */ + bool nwrConnEstablished; +#endif /* WalProposer library specifics */ - #ifdef WALPROPOSER_LIB +#ifdef WALPROPOSER_LIB + /* * Buffer for incoming messages. Usually Rust vector is stored here. * Caller is responsible for freeing the buffer. */ StringInfoData inbuf; - #endif +#endif } Safekeeper; /* Re-exported PostgresPollingStatusType */ @@ -401,31 +418,6 @@ typedef enum */ } WalProposerConnectPollStatusType; -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - - /* - * Any success result other than a single CopyBoth was received. The - * specifics of the result were already logged, but it may be useful to - * provide an error message indicating which safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. - */ - WP_EXEC_UNEXPECTED_SUCCESS, - - /* - * No result available at this time. Wait until read-ready, then call - * again. Internally, this is returned when PQisBusy indicates that - * PQgetResult would block. - */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - /* Re-exported ConnStatusType */ typedef enum { @@ -472,7 +464,7 @@ typedef struct walproposer_api WalProposerConnStatusType (*conn_status) (Safekeeper *sk); /* Start the connection, aka PQconnectStart. */ - void (*conn_connect_start) (Safekeeper *sk); + void (*conn_connect_start) (Safekeeper *sk); /* Poll an asynchronous connection, aka PQconnectPoll. */ WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk); @@ -486,11 +478,11 @@ typedef struct walproposer_api /* Flush buffer to the network, aka PQflush. */ int (*conn_flush) (Safekeeper *sk); - /* Close the connection, aka PQfinish. */ + /* Reset sk state: close pq connection, deallocate xlogreader. 
*/ void (*conn_finish) (Safekeeper *sk); /* - * Try to read CopyData message from the safekeeper, aka PQgetCopyData. + * Try to read CopyData message from the safekeeper, aka PQgetCopyData. * * On success, the data is placed in *buf. It is valid until the next call * to this function. @@ -503,17 +495,20 @@ typedef struct walproposer_api /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); - /* Download WAL from startpos to endpos and make it available locally. */ - bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); - - /* Read WAL from disk to buf. */ - void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count); + /* + * Download WAL before basebackup for logical walsenders from sk, if + * needed + */ + bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. */ - void (*wal_reader_allocate) (Safekeeper *sk); + void (*wal_reader_allocate) (Safekeeper *sk); - /* Deallocate event set. */ - void (*free_event_set) (WalProposer *wp); + /* Read WAL from disk to buf. */ + NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); + + /* Returns events to be awaited on WAL reader, if any. */ + uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); @@ -521,9 +516,15 @@ typedef struct walproposer_api /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); + /* Configure wait event set for yield in SS_ACTIVE. */ + void (*active_state_update_event_set) (Safekeeper *sk); + /* Add a new safekeeper connection to the event set. 
*/ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + /* Remove safekeeper connection from event set */ + void (*rm_safekeeper_event_set) (Safekeeper *sk); + /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available @@ -556,26 +557,12 @@ typedef struct walproposer_api */ void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); - /* - * Called on peer_horizon_lsn updates. Used to advance replication slot - * and to free up disk space by deleting unnecessary WAL. - */ - void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn); - /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); - - /* - * Called right after the proposer was elected, but before it started - * recovery and sent ProposerElected message to the safekeepers. - * - * Used by logical replication to update truncateLsn. - */ - void (*after_election) (WalProposer *wp); } walproposer_api; /* @@ -626,10 +613,10 @@ typedef struct WalProposerConfig uint64 systemId; /* Will be passed to safekeepers in greet request. */ - TimeLineID pgTimeline; + TimeLineID pgTimeline; #ifdef WALPROPOSER_LIB - void *callback_data; + void *callback_data; #endif } WalProposerConfig; @@ -709,14 +696,34 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +/* + * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to + * recreate set from scratch, hence the export. 
+ */ +extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); +extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); -#define WPEVENT 1337 /* special log level for walproposer internal events */ +#define WPEVENT 1337 /* special log level for walproposer internal + * events */ + +#define WP_LOG_PREFIX "[WP] " + +/* + * wp_log is used in pure wp code (walproposer.c), allowing API callback to + * catch logging. + */ #ifdef WALPROPOSER_LIB -void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...); -#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__) +extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...); +#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else -#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif +/* + * And wpg_log is used in all other (postgres specific) walproposer code, just + * adding prefix. + */ +#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 7617f21a26..35d984c52e 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -3,14 +3,17 @@ * This is needed to avoid linking to full postgres server installation. This file * is compiled as a part of libwalproposer static library. 
*/ +#include "postgres.h" #include -#include "walproposer.h" -#include "utils/datetime.h" -#include "miscadmin.h" -void ExceptionalCondition(const char *conditionName, - const char *fileName, int lineNumber) +#include "miscadmin.h" +#include "utils/datetime.h" +#include "walproposer.h" + +void +ExceptionalCondition(const char *conditionName, + const char *fileName, int lineNumber) { fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n", fileName, lineNumber, conditionName); @@ -169,17 +172,18 @@ timestamptz_to_str(TimestampTz t) bool TimestampDifferenceExceeds(TimestampTz start_time, - TimestampTz stop_time, - int msec) + TimestampTz stop_time, + int msec) { TimestampTz diff = stop_time - start_time; + return (diff >= msec * INT64CONST(1000)); } void -WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...) +WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...) { - char buf[1024]; + char buf[1024]; va_list args; fmt = _(fmt); diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index f83a08d407..61a2a54809 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -12,6 +12,7 @@ #include #include #include "access/xact.h" +#include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" @@ -43,14 +44,19 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" -#include "neon.h" -#include "walproposer.h" #include "libpq-fe.h" +#include "libpqwalproposer.h" +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + #define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ +#define MB ((XLogRecPtr)1024 * 1024) + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" char *wal_acceptors_list = ""; @@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp); static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr 
recptr); static void XLogWalPropClose(XLogRecPtr recptr); +static void add_nwr_event_set(Safekeeper *sk, uint32 events); +static void update_nwr_event_set(Safekeeper *sk, uint32 events); +static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); + +static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); + static void init_walprop_config(bool syncSafekeepers) { @@ -214,7 +226,6 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -413,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); + wpg_log(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->greetRequest.timeline; cmd.startpoint = startpos; @@ -538,17 +549,9 @@ walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from walprop_async_read */ -}; - /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk) } } -static void -walprop_connect_start(Safekeeper *sk) +WalProposerConn * +libpqwp_connect_start(char *conninfo) { + PGconn *pg_conn; + WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int 
n; char *password = neon_auth_token; - Assert(sk->conn == NULL); /* * Connect using the given connection string. If the NEON_AUTH_TOKEN @@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk) n++; } keywords[n] = "dbname"; - values[n] = sk->conninfo; + values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; @@ -626,7 +630,7 @@ walprop_connect_start(Safekeeper *sk) * PGconn structure" */ if (!pg_conn) - elog(FATAL, "failed to allocate new PGconn object"); + wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly @@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - sk->conn = palloc(sizeof(WalProposerConn)); - sk->conn->pg_conn = pg_conn; - sk->conn->is_nonblocking = false; /* connections always start in blocking + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking * mode */ - sk->conn->recvbuf = NULL; + conn->recvbuf = NULL; + return conn; +} + +static void +walprop_connect_start(Safekeeper *sk) +{ + Assert(sk->conn == NULL); + sk->conn = libpqwp_connect_start(sk->conninfo); + } static WalProposerConnectPollStatusType @@ -667,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk) * unused. We'll expect it's never returned. 
*/ case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make @@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk) return return_val; } -static bool -walprop_send_query(Safekeeper *sk, char *query) +extern bool +libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ - if (!ensure_nonblocking_status(sk->conn, false)) + if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(sk->conn->pg_conn, query)) + if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } -static WalProposerExecStatusType -walprop_get_query_result(Safekeeper *sk) +static bool +walprop_send_query(Safekeeper *sk, char *query) { + return libpqwp_send_query(sk->conn, query); +} + +WalProposerExecStatusType +libpqwp_get_query_result(WalProposerConn *conn) +{ + PGresult *result; WalProposerExecStatusType return_val; @@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk) char *unexpected_success = NULL; /* Consume any input that we might be missing */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; - if (PQisBusy(sk->conn->pg_conn)) + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; - result = PQgetResult(sk->conn->pg_conn); + result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & @@ -725,7 +745,7 @@ walprop_get_query_result(Safekeeper *sk) */ if (!result) { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } @@ -773,11 +793,17 @@ 
walprop_get_query_result(Safekeeper *sk) } if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } +static WalProposerExecStatusType +walprop_get_query_result(Safekeeper *sk) +{ + return libpqwp_get_query_result(sk->conn); +} + static pgsocket walprop_socket(Safekeeper *sk) { @@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk) return (PQflush(sk->conn->pg_conn)); } -static void -walprop_finish(Safekeeper *sk) +/* Like libpqrcv_receive. *buf is valid until the next call. */ +PGAsyncReadResult +libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - if (!sk->conn) - return; + int rawlen; - if (sk->conn->recvbuf != NULL) - PQfreemem(sk->conn->recvbuf); - PQfinish(sk->conn->pg_conn); - pfree(sk->conn); - sk->conn = NULL; -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. - */ -static PGAsyncReadResult -walprop_async_read(Safekeeper *sk, char **buf, int *amount) -{ - int result; - - if (sk->conn->recvbuf != NULL) + if (conn->recvbuf != NULL) { - PQfreemem(sk->conn->recvbuf); - sk->conn->recvbuf = NULL; + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; } - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(sk->conn->pg_conn)) + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); + if (rawlen == 0) { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; + /* Try consuming some data. 
*/ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* @@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true)) + switch (rawlen) { case 0: *amount = 0; @@ -854,10 +869,10 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ - ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn)); + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported @@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = sk->conn->recvbuf; + *amount = rawlen; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. 
+ */ +static PGAsyncReadResult +walprop_async_read(Safekeeper *sk, char **buf, int *amount) +{ + return libpqwp_async_read(sk->conn, buf, amount); +} + static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { @@ -910,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQputCopyData", result); + wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* @@ -931,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQflush", result); + wpg_log(FATAL, "invalid return %d from PQflush", result); } } @@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) return true; } +void +libpqwp_disconnect(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +static void +walprop_finish(Safekeeper *sk) +{ + if (sk->conn) + { + libpqwp_disconnect(sk->conn); + sk->conn = NULL; + } + + /* free xlogreader */ + if (sk->xlogreader) + { + NeonWALReaderFree(sk->xlogreader); + sk->xlogreader = NULL; + } + rm_safekeeper_event_set(sk, false); +} + /* * Subscribe for new WAL and stream it in the loop to safekeepers. 
* @@ -1165,16 +1219,25 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* - * Receive WAL from most advanced safekeeper - */ +/* Download WAL before basebackup for logical walsenders from sk, if needed */ static bool -WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; char conninfo[MAXCONNINFO]; + TimeLineID timeline; + XLogRecPtr startpos; + XLogRecPtr endpos; + uint64 download_range_mb; + + startpos = GetLogRepRestartLSN(wp); + if (startpos == InvalidXLogRecPtr) + return true; /* recovery not needed */ + endpos = wp->propEpochStartLsn; + + timeline = wp->greetRequest.timeline; if (!neon_auth_token) { @@ -1186,7 +1249,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); + wpg_log(FATAL, "could not append password to the safekeeper connection string"); } #if PG_MAJORVERSION_NUM < 16 @@ -1203,11 +1266,11 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL err))); return false; } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + wpg_log(LOG, + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1291,10 +1354,11 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) /* * Apart from walproposer, 
basebackup LSN page is also written out by * postgres itself which writes WAL only in pages, and in basebackup it is - * inherently dummy (only safekeepers have historic WAL). Update WAL buffers - * here to avoid dummy page overwriting correct one we download here. Ugly, - * but alternatives are about the same ugly. We won't need that if we switch - * to on-demand WAL download from safekeepers, without writing to disk. + * inherently dummy (only safekeepers have historic WAL). Update WAL + * buffers here to avoid dummy page overwriting correct one we download + * here. Ugly, but alternatives are about the same ugly. We won't need + * that if we switch to on-demand WAL download from safekeepers, without + * writing to disk. * * https://github.com/neondatabase/neon/issues/5749 */ @@ -1399,28 +1463,54 @@ XLogWalPropClose(XLogRecPtr recptr) walpropFile = -1; } -static void -walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count) -{ - WALReadError errinfo; - - if (!WALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id(), - &errinfo)) - { - WALReadRaiseError(&errinfo); - } -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { - sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + char log_prefix[64]; + + snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); + Assert(!sk->xlogreader); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); + wpg_log(FATAL, "failed to allocate xlog reader"); +} + +static NeonWALReadResult +walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) +{ + NeonWALReadResult res; + + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + + if 
(res == NEON_WALREAD_SUCCESS) + { + /* + * If we have the socket subscribed, but walreader doesn't need any + * events, it must mean that remote connection just closed hoping to + * do next read locally. Remove the socket then. It is important to do + * as otherwise next read might open another connection and we won't + * be able to distinguish whether we have correct socket added in wait + * event set. + */ + if (NeonWALReaderEvents(sk->xlogreader) == 0) + rm_safekeeper_event_set(sk, false); + } + else if (res == NEON_WALREAD_ERROR) + { + *errmsg = NeonWALReaderErrMsg(sk->xlogreader); + } + + return res; +} + +static uint32 +walprop_pg_wal_reader_events(Safekeeper *sk) +{ + return NeonWALReaderEvents(sk->xlogreader); } static WaitEventSet *waitEvents; @@ -1437,6 +1527,8 @@ walprop_pg_free_event_set(WalProposer *wp) for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1444,13 +1536,39 @@ static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) - elog(FATAL, "double-initialization of event set"); + wpg_log(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers); + /* for each sk, we have socket plus potentially socket for neon walreader */ + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); + + for (int i = 0; i < wp->n_safekeepers; i++) + { + wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; + } +} + +/* add safekeeper socket to wait event set */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->eventPos == -1); + sk->eventPos = AddWaitEventToSet(waitEvents, events, 
walprop_socket(sk), NULL, sk); +} + +/* add neon wal reader socket to wait event set */ +static void +add_nwr_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->nwrEventPos == -1); + sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); + wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1462,10 +1580,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } +/* + * Update neon_walreader event. + * Can be called when nwr socket doesn't exist, does nothing in this case. + */ static void -walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +update_nwr_event_set(Safekeeper *sk, uint32 events) { - sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); + /* eventPos = -1 when we don't have an event */ + if (sk->nwrEventPos != -1) + ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); +} + + +static void +walprop_pg_active_state_update_event_set(Safekeeper *sk) +{ + uint32 sk_events; + uint32 nwr_events; + + Assert(sk->state == SS_ACTIVE); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * If we need to wait for neon_walreader, ensure we have up to date socket + * in the wait event set. + */ + if (sk->active_state == SS_ACTIVE_READ_WAL) + { + /* + * If conn is established and socket is thus stable, update the event + * directly; otherwise re-add it. + */ + if (sk->nwrConnEstablished) + { + Assert(sk->nwrEventPos != -1); + update_nwr_event_set(sk, nwr_events); + } + else + { + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } + } + else + { + /* + * Hack: we should always set 0 here, but for random reasons + * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least + * some event. 
Since there is also no way to remove socket except + * reconstructing the whole set, SafekeeperStateDesiredEvents instead + * gives WL_SOCKET_CLOSED if socket exists. We never expect it to + * trigger. + * + * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event + * removal. + */ +#if PG_VERSION_NUM >= 150000 + Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); + update_nwr_event_set(sk, WL_SOCKET_CLOSED); +#else /* pg 14 */ + rm_safekeeper_event_set(sk, false); +#endif + } + walprop_pg_update_event_set(sk, sk_events); +} + +static void +walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) +{ + rm_safekeeper_event_set(to_remove, true); +} + +/* + * A hacky way to remove single event from the event set. Can be called if event + * doesn't exist, does nothing in this case. + * + * Note: Internally, this completely reconstructs the event set. It should be + * avoided if possible. + * + * If is_sk is true, socket of connection to safekeeper is removed; otherwise + * socket of neon_walreader. + */ +static void +rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) +{ + WalProposer *wp = to_remove->wp; + + wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); + + /* + * Shortpath for exiting if have nothing to do. We never call this + * function with safekeeper socket not existing, but do that with neon + * walreader socket. + */ + if ((is_sk && to_remove->eventPos == -1) || + (!is_sk && to_remove->nwrEventPos == -1)) + { + return; + } + + /* Remove the existing event set, assign sk->eventPos = -1 */ + walprop_pg_free_event_set(wp); + + /* Re-initialize it without adding any safekeeper events */ + wp->api.init_event_set(wp); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. 
+ */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + /* + * If this safekeeper isn't offline, add events for it, except for the + * event requested to remove. + */ + if (sk->state != SS_OFFLINE) + { + uint32 sk_events; + uint32 nwr_events; + + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + if (sk != to_remove || !is_sk) + { + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, sk_events); + } + if ((sk != to_remove || is_sk) && nwr_events) + { + add_nwr_event_set(sk, nwr_events); + } + } + } } static int @@ -1481,6 +1733,21 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 #if PG_MAJORVERSION_NUM >= 16 if (WalSndCtl != NULL) ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); + + /* + * Now that we prepared the condvar, check flush ptr again -- it might + * have changed before we subscribed to cv so we missed the wakeup. + * + * Do that only when we're interested in new WAL: without sync-safekeepers + * and if election already passed. + */ + if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn) + { + ConditionVariableCancelSleep(); + ResetLatch(MyLatch); + *events = WL_LATCH_SET; + return 1; + } #endif /* @@ -1532,7 +1799,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Get PageserverFeedback fields from the most advanced safekeeper + * Choose most advanced PageserverFeedback and set it to *rf. 
*/ static void GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) @@ -1555,15 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); - - replication_feedback_set(rf); + wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); } /* @@ -1603,63 +1868,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) hs->catalog_xmin = InvalidFullTransactionId; } +/* + * Based on commitLsn and safekeeper responses including pageserver feedback, + * 1) Propagate cluster size received from ps to ensure the limit. + * 2) Propagate pageserver LSN positions to ensure backpressure limits. + * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters). + * 4) Propagate hot standby feedback. + * + * None of that is functional in sync-safekeepers. 
+ */ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) { HotStandbyFeedback hsFeedback; - XLogRecPtr diskConsistentLsn; + XLogRecPtr oldDiskConsistentLsn; - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + if (wp->config->syncSafekeepers) + return; - if (!wp->config->syncSafekeepers) + oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + replication_feedback_set(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + + if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - if (commitLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = commitLsn; - /* advance the replication slot */ - if (!wp->config->syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* + * Advance the replication slot to commitLsn. WAL before it is + * hardened and will be fetched from one of safekeepers by + * neon_walreader if needed. + * + * Also wakes up syncrep waiters. + */ + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. 
- */ - quorumFeedback.rf.disk_consistent_lsn, - walprop_pg_get_current_timestamp(wp), false); + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. + */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { quorumFeedback.hs = hsFeedback; - if (!wp->config->syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } } -static void -walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn) -{ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(lsn); -} - static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { @@ -1678,34 +1949,56 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static void -walprop_pg_after_election(WalProposer *wp) +static XLogRecPtr +GetLogRepRestartLSN(WalProposer *wp) { - FILE* f; - XLogRecPtr lrRestartLsn; + FILE *f; + XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; - /* We don't need to do anything in syncSafekeepers mode.*/ + /* We don't need to do anything in syncSafekeepers mode. */ if (wp->config->syncSafekeepers) - return; + return InvalidXLogRecPtr; /* - * If there are active logical replication subscription we need - * to provide enough WAL for their WAL senders based on th position - * of their replication slots. 
+ * If there are active logical replication subscription we need to provide + * enough WAL for their WAL senders based on th position of their + * replication slots. */ f = fopen("restart.lsn", "rb"); - if (f != NULL && !wp->config->syncSafekeepers) + if (f != NULL) { - fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + fclose(f); - if (lrRestartLsn != InvalidXLogRecPtr) + if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { - elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); - /* start from the beginning of the segment to fetch page headers verifed by XLogReader */ + uint64 download_range_mb; + + wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + + /* + * If we need to download more than a max_slot_wal_keep_size, + * don't do it to avoid risk of exploding pg_wal. Logical + * replication won't work until recreated, but at least compute + * would start; this also follows max_slot_wal_keep_size + * semantics. 
+ */ + download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + return InvalidXLogRecPtr; + } + + /* + * start from the beginning of the segment to fetch page headers + * verifed by XLogReader + */ lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); } } + return lrRestartLsn; } static const walproposer_api walprop_pg = { @@ -1725,18 +2018,18 @@ static const walproposer_api walprop_pg = { .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, - .wal_read = walprop_pg_wal_read, .wal_reader_allocate = walprop_pg_wal_reader_allocate, - .free_event_set = walprop_pg_free_event_set, + .wal_read = walprop_pg_wal_read, + .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, + .active_state_update_event_set = walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, - .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, - .after_election = walprop_pg_after_election, }; diff --git a/poetry.lock b/poetry.lock index 830f80dc97..c597d811bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by 
Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"] [[package]] name = "aiopg" -version = "1.3.4" +version = "1.4.0" description = "Postgres integration with asyncio." optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, - {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, + {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"}, + {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"}, ] [package.dependencies] async-timeout = ">=3.0,<5.0" -psycopg2-binary = ">=2.8.4" +psycopg2-binary = ">=2.9.5" [package.extras] sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] @@ -160,64 +160,71 @@ pluggy = ">=0.4.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] [[package]] name = "asyncpg" -version = "0.27.0" +version = "0.29.0" description = "An asyncio PostgreSQL driver" optional = 
false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, - {file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"}, - {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"}, - {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"}, - {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"}, - {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"}, - {file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"}, - {file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"}, - {file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"}, - {file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"}, - {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"}, - {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"}, - 
{file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"}, - {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"}, - {file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"}, - {file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"}, - {file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"}, - {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"}, - {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"}, - {file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"}, - {file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"}, - {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"}, - {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"}, - {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"}, - {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"}, - {file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"}, - {file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"}, - {file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"}, - {file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"}, - {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"}, - {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"}, - {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"}, - {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"}, - {file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"}, - {file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"}, - {file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"}, + {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"}, + {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"}, + {file = 
"asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"}, + {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"}, + {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"}, + {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"}, + {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"}, + {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"}, + {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"}, + {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"}, + {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"}, + {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"}, + {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"}, + {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"}, + {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"}, + {file = 
"asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"}, + {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"}, + {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"}, + {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"}, + {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"}, + {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"}, + {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"}, + {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"}, + {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"}, + {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"}, + {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"}, + {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"}, + {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"}, + {file = 
"asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"}, + {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"}, + {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"}, + {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"}, + {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"}, + {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"}, + {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"}, + {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"}, + {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"}, + {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"}, + {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"}, + {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"}, + {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"}, ] +[package.dependencies] +async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""} + [package.extras] -dev = ["Cython 
(>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"] [[package]] name = "attrs" @@ -332,19 +339,19 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.26.16" +version = "1.34.11" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, - {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, + {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, + {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, ] [package.dependencies] -botocore = ">=1.29.16,<1.30.0" +botocore = ">=1.34.11,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] @@ -695,22 +702,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.16" +version = "1.34.11" description = "Low-level, data-driven core of boto 3." 
optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, - {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, + {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, + {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.19.19)"] [[package]] name = "botocore-stubs" @@ -1882,13 +1892,13 @@ files = [ [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] [package.dependencies] @@ -1900,7 +1910,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", 
"hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -1967,18 +1977,18 @@ pytest = [ [[package]] name = "pytest-rerunfailures" -version = "11.1.2" +version = "13.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"}, - {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"}, + {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, + {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=5.3" +pytest = ">=7" [[package]] name = "pytest-split" @@ -2085,51 +2095,61 @@ files = [ [[package]] name = "pyyaml" -version = "6.0" +version = "6.0.1" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.6" files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = 
"sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] [[package]] @@ -2213,20 +2233,20 @@ files = [ [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, + {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, ] [package.dependencies] -botocore = ">=1.12.36,<2.0a.0" +botocore = ">=1.33.2,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] 
+crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sarif-om" @@ -2546,85 +2566,101 @@ files = [ [[package]] name = "yarl" -version = "1.8.2" +version = "1.9.4" description = "Yet another URL library" optional = false python-versions = ">=3.7" files = [ - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, - {file = 
"yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, - {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, - {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, - {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, - {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, - {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, - {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, - {file = 
"yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, - {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, - {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, - {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, - {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, - {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, - {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, - {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, - {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = 
"yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = 
"yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = 
"yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = 
"yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = 
"yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = 
"yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = 
"yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, ] [package.dependencies] @@ -2707,4 +2743,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "25ffa9ed98d890a3b85e6036792296a60bb705e8f9eaa1f07336501116a58756" +content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0822718bae..c94cd55417 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +testing = [] + [dependencies] anyhow.workspace = true async-trait.workspace = true @@ -57,6 +61,7 @@ thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true +tokio-util.workspace = true tokio = { 
workspace = true, features = ["signal"] } tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true @@ -69,12 +74,12 @@ webpki-roots.workspace = true x509-parser.workspace = true native-tls.workspace = true postgres-native-tls.workspace = true +postgres-protocol.workspace = true +smol_str.workspace = true workspace_hack.workspace = true -tokio-util.workspace = true [dev-dependencies] rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true -postgres-protocol.workspace = true diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 7d79d34045..64ef108e11 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -62,6 +62,9 @@ pub enum AuthErrorImpl { Please add it to the allowed list in the Neon console." )] IpAddressNotAllowed, + + #[error("Too many connections to this endpoint. Please try again later.")] + TooManyConnections, } #[derive(Debug, Error)] @@ -80,6 +83,14 @@ impl AuthError { pub fn ip_address_not_allowed() -> Self { AuthErrorImpl::IpAddressNotAllowed.into() } + + pub fn too_many_connections() -> Self { + AuthErrorImpl::TooManyConnections.into() + } + + pub fn is_auth_failed(&self) -> bool { + matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + } } impl> From for AuthError { @@ -102,6 +113,7 @@ impl UserFacingError for AuthError { MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), IpAddressNotAllowed => self.to_string(), + TooManyConnections => self.to_string(), } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index aa872285b1..923bd02560 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -3,13 +3,15 @@ mod hacks; mod link; pub use link::LinkAuthError; +use smol_str::SmolStr; use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; +use crate::auth::validate_password_and_exchange; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::AuthInfo; use 
crate::console::AuthSecret; -use crate::proxy::{handle_try_wake, retry_after, LatencyTimer}; +use crate::proxy::connect_compute::handle_try_wake; +use crate::proxy::retry::retry_after; use crate::scram; use crate::stream::Stream; use crate::{ @@ -20,35 +22,17 @@ use crate::{ provider::{CachedNodeInfo, ConsoleReqExtra}, Api, }, + metrics::LatencyTimer, stream, url, }; use futures::TryFutureExt; use std::borrow::Cow; +use std::net::IpAddr; use std::ops::ControlFlow; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, warn}; -/// A product of successful authentication. -pub struct AuthSuccess { - /// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client? - pub reported_auth_ok: bool, - /// Something to be considered a positive result. - pub value: T, -} - -impl AuthSuccess { - /// Very similar to [`std::option::Option::map`]. - /// Maps [`AuthSuccess`] to [`AuthSuccess`] by applying - /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> AuthSuccess { - AuthSuccess { - reported_auth_ok: self.reported_auth_ok, - value: f(self.value), - } - } -} - /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -61,9 +45,11 @@ pub enum BackendType<'a, T> { /// Current Cloud API (V2). Console(Cow<'a, console::provider::neon::Api>, T), /// Local mock of Cloud API (V2). + #[cfg(feature = "testing")] Postgres(Cow<'a, console::provider::mock::Api>, T), /// Authentication via a web browser. Link(Cow<'a, url::ApiUrl>), + #[cfg(test)] /// Test backend. 
Test(&'a dyn TestBackend), } @@ -78,8 +64,10 @@ impl std::fmt::Display for BackendType<'_, ()> { use BackendType::*; match self { Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(), + #[cfg(feature = "testing")] Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(), Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + #[cfg(test)] Test(_) => fmt.debug_tuple("Test").finish(), } } @@ -92,8 +80,10 @@ impl BackendType<'_, T> { use BackendType::*; match self { Console(c, x) => Console(Cow::Borrowed(c), x), + #[cfg(feature = "testing")] Postgres(c, x) => Postgres(Cow::Borrowed(c), x), Link(c) => Link(Cow::Borrowed(c)), + #[cfg(test)] Test(x) => Test(*x), } } @@ -107,8 +97,10 @@ impl<'a, T> BackendType<'a, T> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), + #[cfg(feature = "testing")] Postgres(c, x) => Postgres(c, f(x)), Link(c) => Link(c), + #[cfg(test)] Test(x) => Test(x), } } @@ -121,88 +113,165 @@ impl<'a, T, E> BackendType<'a, Result> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), + #[cfg(feature = "testing")] Postgres(c, x) => x.map(|x| Postgres(c, x)), Link(c) => Ok(Link(c)), + #[cfg(test)] Test(x) => Ok(Test(x)), } } } -pub enum ComputeCredentials { +pub struct ComputeCredentials { + pub info: ComputeUserInfo, + pub keys: T, +} + +pub struct ComputeUserInfoNoEndpoint { + pub user: SmolStr, + pub peer_addr: IpAddr, + pub cache_key: SmolStr, +} + +pub struct ComputeUserInfo { + pub endpoint: SmolStr, + pub inner: ComputeUserInfoNoEndpoint, +} + +pub enum ComputeCredentialKeys { + #[cfg(feature = "testing")] Password(Vec), AuthKeys(AuthKeys), } -/// True to its name, this function encapsulates our current auth trade-offs. -/// Here, we choose the appropriate auth flow based on circumstances. 
-async fn auth_quirks_creds( - api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, - creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream>, - allow_cleartext: bool, - config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, -) -> auth::Result> { - // If there's no project so far, that entails that client doesn't - // support SNI or other means of passing the endpoint (project) name. - // We now expect to see a very specific payload in the place of password. - let maybe_success = if creds.project.is_none() { - // Password will be checked by the compute node later. - Some(hacks::password_hack(creds, client, latency_timer).await?) - } else { - None - }; +impl TryFrom for ComputeUserInfo { + // user name + type Error = ComputeUserInfoNoEndpoint; - // Password hack should set the project name. - // TODO: make `creds.project` more type-safe. - assert!(creds.project.is_some()); - info!("fetching user's authentication info"); - // TODO(anna): this will slow down both "hacks" below; we probably need a cache. - let AuthInfo { - secret, - allowed_ips, - } = api.get_auth_info(extra, creds).await?; - - // check allowed list - if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed()); + fn try_from(creds: ClientCredentials) -> Result { + let inner = ComputeUserInfoNoEndpoint { + user: creds.user, + peer_addr: creds.peer_addr, + cache_key: creds.cache_key, + }; + match creds.project { + None => Err(inner), + Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }), + } } - let secret = secret.unwrap_or_else(|| { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. 
- info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random())) - }); - - if let Some(success) = maybe_success { - return Ok(success); - } - - // Perform cleartext auth if we're allowed to do that. - // Currently, we use it for websocket connections (latency). - if allow_cleartext { - // Password will be checked by the compute node later. - return hacks::cleartext_hack(client, latency_timer).await; - } - - // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(creds, client, config, latency_timer, secret).await } /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. +/// +/// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, - creds: &mut ClientCredentials<'_>, + extra: &ConsoleReqExtra, + creds: ClientCredentials, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, -) -> auth::Result> { - let auth_stuff = auth_quirks_creds( +) -> auth::Result> { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the endpoint (project) name. + // We now expect to see a very specific payload in the place of password. 
+ let (info, unauthenticated_password) = match creds.try_into() { + Err(info) => { + let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?; + (res.info, Some(res.keys)) + } + Ok(info) => (info, None), + }; + + info!("fetching user's authentication info"); + let allowed_ips = api.get_allowed_ips(extra, &info).await?; + + // check allowed list + if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed()); + } + let cached_secret = api.get_role_secret(extra, &info).await?; + + let secret = cached_secret.clone().unwrap_or_else(|| { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) + }); + match authenticate_with_secret( + secret, + info, + client, + unauthenticated_password, + allow_cleartext, + config, + latency_timer, + ) + .await + { + Ok(keys) => Ok(keys), + Err(e) => { + if e.is_auth_failed() { + // The password could have been changed, so we invalidate the cache. 
+ cached_secret.invalidate(); + } + Err(e) + } + } +} + +async fn authenticate_with_secret( + secret: AuthSecret, + info: ComputeUserInfo, + client: &mut stream::PqStream>, + unauthenticated_password: Option>, + allow_cleartext: bool, + config: &'static AuthenticationConfig, + latency_timer: &mut LatencyTimer, +) -> auth::Result> { + if let Some(password) = unauthenticated_password { + let auth_outcome = validate_password_and_exchange(&password, secret)?; + let keys = match auth_outcome { + crate::sasl::Outcome::Success(key) => key, + crate::sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + return Err(auth::AuthError::auth_failed(&*info.inner.user)); + } + }; + + // we have authenticated the password + client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?; + + return Ok(ComputeCredentials { info, keys }); + } + + // -- the remaining flows are self-authenticating -- + + // Perform cleartext auth if we're allowed to do that. + // Currently, we use it for websocket connections (latency). + if allow_cleartext { + return hacks::authenticate_cleartext(info, client, latency_timer, secret).await; + } + + // Finally, proceed with the main auth flow (SCRAM-based). + classic::authenticate(info, client, config, latency_timer, secret).await +} + +/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) +/// only if authentication was successful.
+async fn auth_and_wake_compute( + api: &impl console::Api, + extra: &ConsoleReqExtra, + creds: ClientCredentials, + client: &mut stream::PqStream>, + allow_cleartext: bool, + config: &'static AuthenticationConfig, + latency_timer: &mut LatencyTimer, +) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { + let compute_credentials = auth_quirks( api, extra, creds, @@ -215,7 +284,7 @@ async fn auth_quirks( let mut num_retries = 0; let mut node = loop { - let wake_res = api.wake_compute(extra, creds).await; + let wake_res = api.wake_compute(extra, &compute_credentials.info).await; match handle_try_wake(wake_res, num_retries) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); @@ -232,27 +301,27 @@ async fn auth_quirks( tokio::time::sleep(wait_duration).await; }; - match auth_stuff.value { - ComputeCredentials::Password(password) => node.config.password(password), - ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), + match compute_credentials.keys { + #[cfg(feature = "testing")] + ComputeCredentialKeys::Password(password) => node.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), }; - Ok(AuthSuccess { - reported_auth_ok: auth_stuff.reported_auth_ok, - value: node, - }) + Ok((node, compute_credentials.info)) } -impl BackendType<'_, ClientCredentials<'_>> { +impl<'a> BackendType<'a, ClientCredentials> { /// Get compute endpoint name from the credentials. 
- pub fn get_endpoint(&self) -> Option { + pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { Console(_, creds) => creds.project.clone(), + #[cfg(feature = "testing")] Postgres(_, creds) => creds.project.clone(), - Link(_) => Some("link".to_owned()), - Test(_) => Some("test".to_owned()), + Link(_) => Some("link".into()), + #[cfg(test)] + Test(_) => Some("test".into()), } } @@ -261,9 +330,11 @@ impl BackendType<'_, ClientCredentials<'_>> { use BackendType::*; match self { - Console(_, creds) => creds.user, - Postgres(_, creds) => creds.user, + Console(_, creds) => &creds.user, + #[cfg(feature = "testing")] + Postgres(_, creds) => &creds.user, Link(_) => "link", + #[cfg(test)] Test(_) => "test", } } @@ -271,26 +342,25 @@ impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( - &mut self, - extra: &ConsoleReqExtra<'_>, + self, + extra: &ConsoleReqExtra, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, - ) -> auth::Result> { + ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { use BackendType::*; let res = match self { Console(api, creds) => { info!( - user = creds.user, + user = &*creds.user, project = creds.project(), "performing authentication using the console" ); - let api = api.as_ref(); - auth_quirks( - api, + let (cache_info, user_info) = auth_and_wake_compute( + &*api, extra, creds, client, @@ -298,18 +368,19 @@ impl BackendType<'_, ClientCredentials<'_>> { config, latency_timer, ) - .await? 
+ .await?; + (cache_info, BackendType::Console(api, user_info)) } + #[cfg(feature = "testing")] Postgres(api, creds) => { info!( - user = creds.user, + user = &*creds.user, project = creds.project(), "performing authentication using a local postgres instance" ); - let api = api.as_ref(); - auth_quirks( - api, + let (cache_info, user_info) = auth_and_wake_compute( + &*api, extra, creds, client, @@ -317,16 +388,21 @@ impl BackendType<'_, ClientCredentials<'_>> { config, latency_timer, ) - .await? + .await?; + (cache_info, BackendType::Postgres(api, user_info)) } // NOTE: this auth backend doesn't use client credentials. Link(url) => { info!("performing link authentication"); - link::authenticate(url, client) - .await? - .map(CachedNodeInfo::new_uncached) + let node_info = link::authenticate(&url, client).await?; + + ( + CachedNodeInfo::new_uncached(node_info), + BackendType::Link(url), + ) } + #[cfg(test)] Test(_) => { unreachable!("this function should never be called in the test backend") } @@ -335,16 +411,20 @@ impl BackendType<'_, ClientCredentials<'_>> { info!("user successfully authenticated"); Ok(res) } +} +impl BackendType<'_, ComputeUserInfo> { pub async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result>, GetAuthInfoError> { use BackendType::*; match self { Console(api, creds) => api.get_allowed_ips(extra, creds).await, + #[cfg(feature = "testing")] Postgres(api, creds) => api.get_allowed_ips(extra, creds).await, Link(_) => Ok(Arc::new(vec![])), + #[cfg(test)] Test(x) => x.get_allowed_ips(), } } @@ -353,14 +433,16 @@ impl BackendType<'_, ClientCredentials<'_>> { /// The link auth flow doesn't support this, so we return [`None`] in that case. 
pub async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; match self { Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, + #[cfg(feature = "testing")] Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, Link(_) => Ok(None), + #[cfg(test)] Test(x) => x.wake_compute().map(Some), } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index bb210821cd..5c394ec649 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,10 +1,10 @@ -use super::{AuthSuccess, ComputeCredentials}; +use super::{ComputeCredentials, ComputeUserInfo}; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, + auth::{self, backend::ComputeCredentialKeys, AuthFlow}, compute, config::AuthenticationConfig, console::AuthSecret, - proxy::LatencyTimer, + metrics::LatencyTimer, sasl, stream::{PqStream, Stream}, }; @@ -12,14 +12,15 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - creds: &ClientCredentials<'_>, + creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result> { let flow = AuthFlow::new(client); let scram_keys = match secret { + #[cfg(feature = "testing")] AuthSecret::Md5(_) => { info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); @@ -53,7 +54,7 @@ pub(super) async fn authenticate( sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(creds.user)); + return Err(auth::AuthError::auth_failed(&*creds.inner.user)); } }; @@ -64,9 +65,9 @@ pub(super) async fn authenticate( } }; - Ok(AuthSuccess { - reported_auth_ok: false, - value: 
ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( + Ok(ComputeCredentials { + info: creds, + keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( scram_keys, )), }) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 4448dbc56a..5dde514bca 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,7 +1,11 @@ -use super::{AuthSuccess, ComputeCredentials}; +use super::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint, +}; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, - proxy::LatencyTimer, + auth::{self, AuthFlow}, + console::AuthSecret, + metrics::LatencyTimer, + sasl, stream::{self, Stream}, }; use tokio::io::{AsyncRead, AsyncWrite}; @@ -11,35 +15,42 @@ use tracing::{info, warn}; /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. -pub async fn cleartext_hack( +pub async fn authenticate_cleartext( + info: ComputeUserInfo, client: &mut stream::PqStream>, latency_timer: &mut LatencyTimer, -) -> auth::Result> { + secret: AuthSecret, +) -> auth::Result> { warn!("cleartext auth flow override is enabled, proceeding"); // pause the timer while we communicate with the client let _paused = latency_timer.pause(); - let password = AuthFlow::new(client) - .begin(auth::CleartextPassword) + let auth_outcome = AuthFlow::new(client) + .begin(auth::CleartextPassword(secret)) .await? .authenticate() .await?; - // Report tentative success; compute node will check the password anyway. 
- Ok(AuthSuccess { - reported_auth_ok: false, - value: ComputeCredentials::Password(password), - }) + let keys = match auth_outcome { + sasl::Outcome::Success(key) => key, + sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + return Err(auth::AuthError::auth_failed(&*info.inner.user)); + } + }; + + Ok(ComputeCredentials { info, keys }) } /// Workaround for clients which don't provide an endpoint (project) name. -/// Very similar to [`cleartext_hack`], but there's a specific password format. -pub async fn password_hack( - creds: &mut ClientCredentials<'_>, +/// Similar to [`authenticate_cleartext`], but there's a specific password format, +/// and passwords are not yet validated (we don't know how to validate them!) +pub async fn password_hack_no_authentication( + info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, latency_timer: &mut LatencyTimer, -) -> auth::Result> { +) -> auth::Result>> { warn!("project not specified, resorting to the password hack auth flow"); // pause the timer while we communicate with the client @@ -48,15 +59,17 @@ pub async fn password_hack( let payload = AuthFlow::new(client) .begin(auth::PasswordHack) .await? - .authenticate() + .get_password() .await?; - info!(project = &payload.endpoint, "received missing parameter"); - creds.project = Some(payload.endpoint); + info!(project = &*payload.endpoint, "received missing parameter"); // Report tentative success; compute node will check the password anyway. 
- Ok(AuthSuccess { - reported_auth_ok: false, - value: ComputeCredentials::Password(payload.password), + Ok(ComputeCredentials { + info: ComputeUserInfo { + inner: info, + endpoint: payload.endpoint, + }, + keys: payload.password, }) } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index da43cf11c4..2cf7e3acc7 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,4 +1,3 @@ -use super::AuthSuccess; use crate::{ auth, compute, console::{self, provider::NodeInfo}, @@ -57,7 +56,7 @@ pub fn new_psql_session_id() -> String { pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result> { +) -> auth::Result { let psql_session_id = new_psql_session_id(); let span = info_span!("link", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); @@ -102,12 +101,9 @@ pub(super) async fn authenticate( config.password(password.as_ref()); } - Ok(AuthSuccess { - reported_auth_ok: true, - value: NodeInfo { - config, - aux: db_info.aux.into(), - allow_self_signed_compute: false, // caller may override - }, + Ok(NodeInfo { + config, + aux: db_info.aux, + allow_self_signed_compute: false, // caller may override }) } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index facb8da8cd..c04769a199 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,16 +1,13 @@ //! User credentials used in authentication. 
use crate::{ - auth::password_hack::parse_endpoint_param, - error::UserFacingError, - proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI}, + auth::password_hack::parse_endpoint_param, error::UserFacingError, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str, }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{ - collections::HashSet, - net::{IpAddr, SocketAddr}, -}; +use smol_str::SmolStr; +use std::{collections::HashSet, net::IpAddr}; use thiserror::Error; use tracing::{info, warn}; @@ -24,7 +21,7 @@ pub enum ClientCredsParseError { SNI ('{}') and project option ('{}').", .domain, .option, )] - InconsistentProjectNames { domain: String, option: String }, + InconsistentProjectNames { domain: SmolStr, option: SmolStr }, #[error( "Common name inferred from SNI ('{}') is not known", @@ -33,7 +30,7 @@ pub enum ClientCredsParseError { UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] - MalformedProjectName(String), + MalformedProjectName(SmolStr), } impl UserFacingError for ClientCredsParseError {} @@ -41,34 +38,34 @@ impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ClientCredentials<'a> { - pub user: &'a str, +pub struct ClientCredentials { + pub user: SmolStr, // TODO: this is a severe misnomer! We should think of a new name ASAP. 
- pub project: Option, + pub project: Option, - pub cache_key: String, - pub peer_addr: SocketAddr, + pub cache_key: SmolStr, + pub peer_addr: IpAddr, } -impl ClientCredentials<'_> { +impl ClientCredentials { #[inline] pub fn project(&self) -> Option<&str> { self.project.as_deref() } } -impl<'a> ClientCredentials<'a> { +impl ClientCredentials { pub fn parse( - params: &'a StartupMessageParams, + params: &StartupMessageParams, sni: Option<&str>, common_names: Option>, - peer_addr: SocketAddr, + peer_addr: IpAddr, ) -> Result { use ClientCredsParseError::*; // Some parameters are stored in the startup message. let get_param = |key| params.get(key).ok_or(MissingKey(key)); - let user = get_param("user")?; + let user = get_param("user")?.into(); // Project name might be passed via PG's command-line options. let project_option = params @@ -82,7 +79,7 @@ impl<'a> ClientCredentials<'a> { .at_most_one() .ok()? }) - .map(|name| name.to_string()); + .map(|name| name.into()); let project_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { @@ -121,7 +118,7 @@ impl<'a> ClientCredentials<'a> { } .transpose()?; - info!(user, project = project.as_deref(), "credentials"); + info!(%user, project = project.as_deref(), "credentials"); if sni.is_some() { info!("Connection with sni"); NUM_CONNECTION_ACCEPTED_BY_SNI @@ -142,8 +139,9 @@ impl<'a> ClientCredentials<'a> { let cache_key = format!( "{}{}", project.as_deref().unwrap_or(""), - neon_options(params).unwrap_or("".to_string()) - ); + neon_options_str(params) + ) + .into(); Ok(Self { user, @@ -206,10 +204,10 @@ fn project_name_valid(name: &str) -> bool { name.chars().all(|c| c.is_alphanumeric() || c == '-') } -fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { +fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { sni.strip_suffix(common_name)? 
.strip_suffix('.') - .map(str::to_owned) + .map(SmolStr::from) } #[cfg(test)] @@ -221,7 +219,7 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -236,7 +234,7 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -251,7 +249,7 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -267,7 +265,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -282,7 +280,7 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ 
-300,7 +298,7 @@ mod tests { ), ]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert!(creds.project.is_none()); @@ -315,7 +313,7 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert!(creds.project.is_none()); @@ -330,7 +328,7 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -344,13 +342,13 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("p1")); @@ -365,7 +363,7 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let err = ClientCredentials::parse(&options, 
sni, common_names, peer_addr) .expect_err("should fail"); match err { @@ -384,7 +382,7 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let err = ClientCredentials::parse(&options, sni, common_names, peer_addr) .expect_err("should fail"); match err { @@ -404,13 +402,10 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let peer_addr = IpAddr::from([127, 0, 0, 1]); let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("project")); - assert_eq!( - creds.cache_key, - "projectneon_endpoint_type:read_write neon_lsn:0/2" - ); + assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2"); Ok(()) } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index efb90733d6..3151a77263 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,8 +1,9 @@ //! Main authentication flow. -use super::{AuthErrorImpl, PasswordHackPayload}; +use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, + console::AuthSecret, sasl, scram, stream::{PqStream, Stream}, }; @@ -50,7 +51,7 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword; +pub struct CleartextPassword(pub AuthSecret); impl AuthMethod for CleartextPassword { #[inline(always)] @@ -98,7 +99,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { impl AuthFlow<'_, S, PasswordHack> { /// Perform user authentication. Raise an error in case authentication failed. 
- pub async fn authenticate(self) -> super::Result { + pub async fn get_password(self) -> super::Result { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -117,13 +118,19 @@ impl AuthFlow<'_, S, PasswordHack> { impl AuthFlow<'_, S, CleartextPassword> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub async fn authenticate(self) -> super::Result> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - Ok(password.to_vec()) + let outcome = validate_password_and_exchange(password, self.state.0)?; + + if let sasl::Outcome::Success(_) = &outcome { + self.stream.write_message_noflush(&Be::AuthenticationOk)?; + } + + Ok(outcome) } } @@ -152,6 +159,49 @@ impl AuthFlow<'_, S, Scram<'_>> { )) .await?; + if let sasl::Outcome::Success(_) = &outcome { + self.stream.write_message_noflush(&Be::AuthenticationOk)?; + } + Ok(outcome) } } + +pub(super) fn validate_password_and_exchange( + password: &[u8], + secret: AuthSecret, +) -> super::Result> { + match secret { + #[cfg(feature = "testing")] + AuthSecret::Md5(_) => { + // test only + Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password( + password.to_owned(), + ))) + } + // perform scram authentication as both client and server to validate the keys + AuthSecret::Scram(scram_secret) => { + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported()); + let outcome = crate::scram::exchange( + &scram_secret, + sasl_client, + crate::config::TlsServerEndPoint::Undefined, + )?; + + let client_key = match outcome { + sasl::Outcome::Success(client_key) => client_key, + sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)), + }; + + let keys = crate::compute::ScramKeys { + 
client_key: client_key.as_bytes(), + server_key: scram_secret.server_key.as_bytes(), + }; + + Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( + tokio_postgres::config::AuthKeys::ScramSha256(keys), + ))) + } + } +} diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index d1da208fef..372b0764ee 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -4,9 +4,10 @@ //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. use bstr::ByteSlice; +use smol_str::SmolStr; pub struct PasswordHackPayload { - pub endpoint: String, + pub endpoint: SmolStr, pub password: Vec, } @@ -18,7 +19,7 @@ impl PasswordHackPayload { if let Some((endpoint, password)) = bytes.split_once_str(sep) { let endpoint = endpoint.to_str().ok()?; return Some(Self { - endpoint: parse_endpoint_param(endpoint)?.to_owned(), + endpoint: parse_endpoint_param(endpoint)?.into(), password: password.to_owned(), }); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 2b859fc2db..d48ba3a54e 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc}; use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; +use proxy::proxy::run_until_cancelled; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -20,7 +21,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use utils::{project_git_version, sentry_init::init_sentry}; -use tracing::{error, info, warn, Instrument}; +use tracing::{error, info, Instrument}; project_git_version!(GIT_VERSION); @@ -151,63 +152,39 @@ async fn task_main( // will be inherited by all accepted client sockets. 
socket2::SockRef::from(&listener).set_keepalive(true)?; - let mut connections = tokio::task::JoinSet::new(); + let connections = tokio_util::task::task_tracker::TaskTracker::new(); - loop { - tokio::select! { - accept_result = listener.accept() => { - let (socket, peer_addr) = accept_result?; + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; - let session_id = uuid::Uuid::new_v4(); - let tls_config = Arc::clone(&tls_config); - let dest_suffix = Arc::clone(&dest_suffix); + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); - connections.spawn( - async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; + connections.spawn( + async move { + socket + .set_nodelay(true) + .context("failed to set socket option")?; - info!(%peer_addr, "serving"); - handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await - } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); - }) - .instrument(tracing::info_span!("handle_client", ?session_id)) - ); + info!(%peer_addr, "serving"); + handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await } - // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully. - // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`. - // This only counts for this loop and it will be enabled again on next `select!`. - // - // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not. - // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would - // not get called again, even if there are more connections to remove. 
- Some(res) = connections.join_next() => { - if let Err(e) = res { - if !e.is_panic() && !e.is_cancelled() { - warn!("unexpected error from joined connection task: {e:?}"); - } - } - } - _ = cancellation_token.cancelled() => { - drop(listener); - break; - } - } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. + error!("per-client task finished with an error: {e:#}"); + }) + .instrument(tracing::info_span!("handle_client", ?session_id)), + ); } - // Drain connections - info!("waiting for all client connections to finish"); - while let Some(res) = connections.join_next().await { - if let Err(e) = res { - if !e.is_panic() && !e.is_cancelled() { - warn!("unexpected error from joined connection task: {e:?}"); - } - } - } + connections.close(); + drop(listener); + + connections.wait().await; + info!("all client connections have finished"); Ok(()) } @@ -284,5 +261,5 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await + proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7457e26867..5bc2d377a6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -6,14 +6,19 @@ use proxy::config::HttpConfig; use proxy::console; use proxy::console::provider::AllowedIpsCache; use proxy::console::provider::NodeInfoCache; +use proxy::console::provider::RoleSecretCache; use proxy::http; +use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; use std::pin::pin; +use std::sync::Arc; use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; use tokio::task::JoinSet; @@ -30,6 
+35,7 @@ use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] enum AuthBackend { Console, + #[cfg(feature = "testing")] Postgres, Link, } @@ -82,7 +88,7 @@ struct ProxyCliArgs { #[clap(long)] metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] @@ -90,12 +96,8 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, - /// timeout for http connections - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -103,7 +105,7 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. 
- #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] @@ -111,19 +113,58 @@ struct ProxyCliArgs { /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] rate_limiter_timeout: tokio::time::Duration, + /// Endpoint rate limiter max number of requests per second. + /// + /// Provided in the form '@'. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + endpoint_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, #[clap(flatten)] aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, } +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will causes sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. + /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let _logging_guard = proxy::logging::init().await?; @@ -153,6 +194,8 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -160,6 +203,7 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -173,6 +217,7 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), + endpoint_rate_limiter.clone(), )); } @@ -252,9 +297,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; + let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); + info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { node_info: NodeInfoCache::new( "node_info_cache", @@ -268,6 +315,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allowed_ips_cache_config.ttl, false, ), + role_secret: RoleSecretCache::new( + "role_secret_cache", + role_secret_cache_config.size, + role_secret_cache_config.ttl, + false, + ), })); let config::WakeComputeLockOptions { @@ -289,6 +342,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let api = console::provider::neon::Api::new(endpoint, caches, locks); auth::BackendType::Console(Cow::Owned(api), ()) } + #[cfg(feature = "testing")] AuthBackend::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); @@ -300,12 +354,22 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static 
ProxyConfig> { } }; let http_config = HttpConfig { - timeout: args.sql_over_http_timeout, - pool_opt_in: args.sql_over_http_pool_opt_in, + request_timeout: args.sql_over_http.sql_over_http_timeout, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + }, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, }; + + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -315,7 +379,35 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, + endpoint_rps_limit, })); Ok(config) } + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use clap::Parser; + use proxy::rate_limiter::RateBucketInfo; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c838c8fc38..a54ba56e43 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,10 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - error::UserFacingError, proxy::is_neon_param, + error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, 
proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; +use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -223,6 +224,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + + _guage: IntCounterPairGuard, } impl ConnCfg { @@ -231,6 +234,7 @@ impl ConnCfg { &self, allow_self_signed_compute: bool, timeout: Duration, + proto: &'static str, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -264,6 +268,7 @@ impl ConnCfg { stream, params, cancel_closure, + _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(), }; Ok(connection) @@ -275,7 +280,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] let options: String = params .options_raw()? - .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt)) + .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 182d71f9be..610bf7e424 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::auth; +use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -20,6 +20,7 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, + pub endpoint_rps_limit: Vec, } #[derive(Debug)] @@ -35,8 +36,8 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub timeout: tokio::time::Duration, - pub pool_opt_in: bool, + pub request_timeout: tokio::time::Duration, + pub pool_options: GlobalConnPoolOptions, } 
pub struct AuthenticationConfig { @@ -309,10 +310,10 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::console::provider::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. - /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. fn parse(options: &str) -> anyhow::Result { let mut size = None; let mut ttl = None; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index e5f1615b14..837379b21f 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,5 @@ use serde::Deserialize; +use smol_str::SmolStr; use std::fmt; /// Generic error response with human-readable description. @@ -88,11 +89,11 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. 
-#[derive(Debug, Deserialize, Default)] +#[derive(Debug, Deserialize, Clone, Default)] pub struct MetricsAuxInfo { - pub endpoint_id: Box, - pub project_id: Box, - pub branch_id: Box, + pub endpoint_id: SmolStr, + pub project_id: SmolStr, + pub branch_id: SmolStr, } impl MetricsAuxInfo { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a525de8e53..e4cf1e8c8e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -1,14 +1,16 @@ +#[cfg(feature = "testing")] pub mod mock; pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::ClientCredentials, + auth::backend::ComputeUserInfo, cache::{timed_lru, TimedLru}, compute, scram, }; use async_trait::async_trait; use dashmap::DashMap; +use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; use tokio::{ sync::{OwnedSemaphorePermit, Semaphore}, @@ -20,7 +22,7 @@ pub mod errors { use crate::{ error::{io_error, UserFacingError}, http, - proxy::ShouldRetry, + proxy::retry::ShouldRetry, }; use thiserror::Error; @@ -195,16 +197,29 @@ pub mod errors { } /// Extra query params we'd like to pass to the console. -pub struct ConsoleReqExtra<'a> { +pub struct ConsoleReqExtra { /// A unique identifier for a connection. pub session_id: uuid::Uuid, /// Name of client application, if set. - pub application_name: Option<&'a str>, - pub options: Option<&'a str>, + pub application_name: String, + pub options: Vec<(String, String)>, +} + +impl ConsoleReqExtra { + // https://swagger.io/docs/specification/serialization/ DeepObject format + // paramName[prop1]=value1¶mName[prop2]=value2&.... + pub fn options_as_deep_object(&self) -> Vec<(String, String)> { + self.options + .iter() + .map(|(k, v)| (format!("options[{}]", k), v.to_string())) + .collect() + } } /// Auth secret which is managed by the cloud. +#[derive(Clone)] pub enum AuthSecret { + #[cfg(feature = "testing")] /// Md5 hash of user's password. 
Md5([u8; 16]), @@ -229,7 +244,7 @@ pub struct NodeInfo { pub config: compute::ConnCfg, /// Labels for proxy's metrics. - pub aux: Arc, + pub aux: MetricsAuxInfo, /// Whether we should accept self-signed certificates (for testing) pub allow_self_signed_compute: bool, @@ -237,30 +252,32 @@ pub struct NodeInfo { pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru, Arc>>; +pub type AllowedIpsCache = TimedLru>>; +pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; +pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. #[async_trait] pub trait Api { /// Get the client's auth secret for authentication. - async fn get_auth_info( + async fn get_role_secret( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, - ) -> Result; + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, + ) -> Result; async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result>, errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result; } @@ -269,7 +286,9 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: TimedLru, Arc>>, + pub allowed_ips: AllowedIpsCache, + /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. + pub role_secret: RoleSecretCache, } /// Various caches for [`console`](super). 
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 4cc68f0ac1..dba5e5863f 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -6,7 +6,8 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; -use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl}; +use crate::console::provider::CachedRoleSecret; +use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; use thiserror::Error; @@ -47,7 +48,7 @@ impl Api { async fn do_get_auth_info( &self, - creds: &ClientCredentials<'_>, + creds: &ComputeUserInfo, ) -> Result { let (secret, allowed_ips) = async { // Perhaps we could persist this connection, but then we'd have to @@ -60,7 +61,7 @@ impl Api { let secret = match get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", - &[&creds.user], + &[&&*creds.inner.user], "rolpassword", ) .await? @@ -71,14 +72,14 @@ impl Api { secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) } None => { - warn!("user '{}' does not exist", creds.user); + warn!("user '{}' does not exist", creds.inner.user); None } }; let allowed_ips = match get_execute_postgres_query( &client, "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", - &[&creds.project.clone().unwrap_or_default().as_str()], + &[&creds.endpoint.as_str()], "allowed_ips", ) .await? 
@@ -142,18 +143,20 @@ async fn get_execute_postgres_query( #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, - _extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, - ) -> Result { - self.do_get_auth_info(creds).await + _extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached( + self.do_get_auth_info(creds).await?.secret, + )) } async fn get_allowed_ips( &self, - _extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, + _extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) } @@ -161,8 +164,8 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _extra: &ConsoleReqExtra<'_>, - _creds: &ClientCredentials, + _extra: &ConsoleReqExtra, + _creds: &ComputeUserInfo, ) -> Result { self.do_wake_compute() .map_ok(CachedNodeInfo::new_uncached) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 117d0ec190..5bf7b0f986 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,18 +3,15 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, -}; -use crate::{ - auth::ClientCredentials, - compute, http, - proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, - scram, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + NodeInfo, }; +use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; +use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use std::{net::SocketAddr, sync::Arc}; +use std::sync::Arc; use 
tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -52,8 +49,8 @@ impl Api { async fn do_get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); async { @@ -64,9 +61,9 @@ impl Api { .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(creds.project().expect("impossible"))), - ("role", Some(creds.user)), + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), + ("role", creds.inner.user.as_str()), ]) .build()?; @@ -105,24 +102,28 @@ impl Api { async fn do_wake_compute( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result { - let project = creds.project().expect("impossible"); let request_id = uuid::Uuid::new_v4().to_string(); async { - let request = self + let mut request_builder = self .endpoint .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(project)), - ("options", extra.options), - ]) - .build()?; + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), + ]); + + request_builder = if extra.options.is_empty() { + request_builder + } else { + request_builder.query(&extra.options_as_deep_object()) + }; + let request = request_builder.build()?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); @@ -140,11 +141,11 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). 
let mut config = compute::ConnCfg::new(); - config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, - aux: body.aux.into(), + aux: body.aux, allow_self_signed_compute: false, }; @@ -159,21 +160,33 @@ impl Api { #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, - ) -> Result { - self.do_get_auth_info(extra, creds).await + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, + ) -> Result { + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); + if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(extra, creds).await?; + let (_, secret) = self + .caches + .role_secret + .insert((ep.clone(), user), auth_info.secret.clone()); + self.caches + .allowed_ips + .insert(ep, Arc::new(auth_info.allowed_ips)); + Ok(secret) } async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { - let key: &str = creds.project().expect("impossible"); - if let Some(allowed_ips) = self.caches.allowed_ips.get(key) { + if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -182,20 +195,24 @@ impl super::Api for Api { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) .inc(); - let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips); + let auth_info = self.do_get_auth_info(extra, creds).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); 
self.caches - .allowed_ips - .insert(key.into(), allowed_ips.clone()); + .role_secret + .insert((ep.clone(), user), auth_info.secret); + self.caches.allowed_ips.insert(ep, allowed_ips.clone()); Ok(allowed_ips) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials, + extra: &ConsoleReqExtra, + creds: &ComputeUserInfo, ) -> Result { - let key: &str = &creds.cache_key; + let key: &str = &creds.inner.cache_key; // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); @@ -252,9 +269,10 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(String, u16)> { - let parsed: SocketAddr = input.parse().ok()?; - Some((parsed.ip().to_string(), parsed.port())) +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) } #[cfg(test)] @@ -262,9 +280,24 @@ mod tests { use super::*; #[test] - fn test_parse_host_port() { + fn test_parse_host_port_v4() { let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); assert_eq!(host, "127.0.0.1"); assert_eq!(port, 5432); } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } } diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 638705d3e9..59e1492ed4 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,7 @@ pub use 
reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -95,7 +95,7 @@ impl Endpoint { let res = self.client.execute(request).await; CONSOLE_REQUEST_LATENCY .with_label_values(&[&path]) - .observe(start.elapsed().as_micros() as f64); + .observe(start.elapsed().as_secs_f64()); res } } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a22600cbb3..2da1eaf482 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -16,6 +16,7 @@ pub mod console; pub mod error; pub mod http; pub mod logging; +pub mod metrics; pub mod parse; pub mod protocol2; pub mod proxy; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs new file mode 100644 index 0000000000..8e2a6105b1 --- /dev/null +++ b/proxy/src/metrics.rs @@ -0,0 +1,232 @@ +use ::metrics::{ + exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, + IntCounterPairVec, IntCounterVec, +}; +use prometheus::{ + register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, + IntGaugeVec, +}; + +use once_cell::sync::Lazy; +use tokio::time; + +pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_opened_db_connections_total", + "Number of opened connections to a database.", + "proxy_closed_db_connections_total", + "Number of closed connections to a database.", + &["protocol"], + ) + .unwrap() +}); + +pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_opened_client_connections_total", + "Number of opened connections from a client.", + "proxy_closed_client_connections_total", + "Number of closed connections from a client.", + &["protocol"], + ) + .unwrap() +}); + +pub 
static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_accepted_connections_total", + "Number of client connections accepted.", + "proxy_closed_connections_total", + "Number of client connections closed.", + &["protocol"], + ) + .unwrap() +}); + +pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( + "proxy_compute_connection_latency_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // http/ws/tcp, true/false, true/false, success/failure + // 3 * 2 * 2 * 2 = 24 counters + &["protocol", "cache_miss", "pool_miss", "outcome"], + // largest bucket = 2^16 * 0.5ms = 32s + exponential_buckets(0.0005, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( + "proxy_console_request_latency", + "Time it took for proxy to establish a connection to the compute endpoint", + // proxy_wake_compute/proxy_get_role_info + &["request"], + // largest bucket = 2^16 * 0.2ms = 13s + exponential_buckets(0.0002, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_allowed_ips_cache_misses", + "Number of cache hits/misses for allowed ips", + // hit/miss + &["outcome"], + ) + .unwrap() +}); + +pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_control_plane_token_acquire_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 3^16 * 0.05ms = 2.15s + exponential_buckets(0.00005, 3.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "semaphore_control_plane_limit", + "Current limit of the semaphore control plane", + &["limit"], // 2 counters + ) + .unwrap() +}); + +pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { + 
register_int_counter_vec!( + "proxy_accepted_connections_by_sni", + "Number of connections (per sni).", + &["kind"], + ) + .unwrap() +}); + +pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_allowed_ips_number", + "Number of allowed ips", + vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], + ) + .unwrap() +}); + +pub struct LatencyTimer { + // time since the stopwatch was started + start: Option, + // accumulated time on the stopwatch + accumulated: std::time::Duration, + // label data + protocol: &'static str, + cache_miss: bool, + pool_miss: bool, + outcome: &'static str, +} + +pub struct LatencyTimerPause<'a> { + timer: &'a mut LatencyTimer, +} + +impl LatencyTimer { + pub fn new(protocol: &'static str) -> Self { + Self { + start: Some(time::Instant::now()), + accumulated: std::time::Duration::ZERO, + protocol, + cache_miss: false, + // by default we don't do pooling + pool_miss: true, + // assume failed unless otherwise specified + outcome: "failed", + } + } + + pub fn pause(&mut self) -> LatencyTimerPause<'_> { + // stop the stopwatch and record the time that we have accumulated + let start = self.start.take().expect("latency timer should be started"); + self.accumulated += start.elapsed(); + LatencyTimerPause { timer: self } + } + + pub fn cache_miss(&mut self) { + self.cache_miss = true; + } + + pub fn pool_hit(&mut self) { + self.pool_miss = false; + } + + pub fn success(mut self) { + self.outcome = "success"; + } +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + // start the stopwatch again + self.timer.start = Some(time::Instant::now()); + } +} + +impl Drop for LatencyTimer { + fn drop(&mut self) { + let duration = + self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; + COMPUTE_CONNECTION_LATENCY + .with_label_values(&[ + self.protocol, + bool_to_str(self.cache_miss), + bool_to_str(self.pool_miss), + self.outcome, + ]) + .observe(duration.as_secs_f64()) + } +} + 
+pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_connection_failures_total", + "Number of connection failures (per kind).", + &["kind"], + ) + .unwrap() +}); + +pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_connection_failures_breakdown", + "Number of wake-up failures (per kind).", + &["retry", "kind"], + ) + .unwrap() +}); + +pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_io_bytes_per_client", + "Number of bytes sent/received between client and backend.", + crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, + ) + .unwrap() +}); + +pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_io_bytes", + "Number of bytes sent/received between all clients and backends.", + &["direction"], + ) + .unwrap() +}); + +pub const fn bool_to_str(x: bool) -> &'static str { + if x { + "true" + } else { + "false" + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2af2dd5562..17e910860c 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,286 +1,61 @@ #[cfg(test)] mod tests; +pub mod connect_compute; +pub mod retry; + use crate::{ - auth::{self, backend::AuthSuccess}, + auth, cancellation::{self, CancelMap}, - compute::{self, PostgresConnection}, + compute, config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, - http::StatusCode, + console::{self, messages::MetricsAuxInfo}, + metrics::{ + LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, + NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, + }, protocol2::WithClientIp, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, usage_metrics::{Ids, USAGE_METRICS}, }; use anyhow::{bail, Context}; -use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use 
metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, -}; use regex::Regex; -use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant}; -use tokio::{ - io::{AsyncRead, AsyncWrite, AsyncWriteExt}, - time, -}; +use std::{net::IpAddr, sync::Arc}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, Instrument}; use utils::measured_stream::MeasuredStream; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; +use self::connect_compute::{connect_to_compute, TcpMechanism}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - &["protocol"], +pub async fn run_until_cancelled( + f: F, + cancellation_token: &CancellationToken, +) -> Option { + match futures::future::select( + std::pin::pin!(f), + std::pin::pin!(cancellation_token.cancelled()), ) - .unwrap() -}); - -pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - 
"proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); - -static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], - // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.2, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number 
of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); - -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "semaphore_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.00005s = 3.28s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); - -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); - -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); - -pub struct LatencyTimer { - // time since the stopwatch was started - start: Option, - // accumulated time on the stopwatch - accumulated: std::time::Duration, - // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, -} - -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, -} - -impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { - Self { - start: Some(Instant::now()), - accumulated: std::time::Duration::ZERO, - protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, - // assume failed unless otherwise specified - outcome: "failed", - } - } - - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: 
self } - } - - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; - } - - pub fn success(mut self) { - self.outcome = "success"; + .await + { + futures::future::Either::Left((f, _)) => Some(f), + futures::future::Either::Right(((), _)) => None, } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(Instant::now()); - } -} - -impl Drop for LatencyTimer { - fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - ]) - .observe(duration.as_secs_f64()) - } -} - -static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - -static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -290,71 +65,65 @@ pub async fn task_main( // will be inherited by all accepted client sockets. socket2::SockRef::from(&listener).set_keepalive(true)?; - let mut connections = tokio::task::JoinSet::new(); + let connections = tokio_util::task::task_tracker::TaskTracker::new(); let cancel_map = Arc::new(CancelMap::default()); - loop { - tokio::select! { - accept_result = listener.accept() => { - let (socket, peer_addr) = accept_result?; + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; - let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); - connections.spawn( - async move { - info!("accepted postgres client connection"); + let session_id = uuid::Uuid::new_v4(); + let cancel_map = Arc::clone(&cancel_map); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr; - if let Some(ip) = socket.wait_for_addr().await? { - peer_addr = ip; - tracing::Span::current().record("peer_addr", &tracing::field::display(ip)); - } else if config.require_client_ip { - bail!("missing required client IP"); - } + connections.spawn( + async move { + info!("accepted postgres client connection"); - socket - .inner - .set_nodelay(true) - .context("failed to set socket option")?; - - handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await - } - .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty)) - .unwrap_or_else(move |e| { - // Acknowledge that the task has finished with an error. - error!(?session_id, "per-client task finished with an error: {e:#}"); - }), - ); - } - // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully. - // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`. 
- // This only counts for this loop and it will be enabled again on next `select!`. - // - // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not. - // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would - // not get called again, even if there are more connections to remove. - Some(res) = connections.join_next() => { - if let Err(e) = res { - if !e.is_panic() && !e.is_cancelled() { - warn!("unexpected error from joined connection task: {e:?}"); - } + let mut socket = WithClientIp::new(socket); + let mut peer_addr = peer_addr; + if let Some(ip) = socket.wait_for_addr().await? { + peer_addr = ip; + tracing::Span::current().record("peer_addr", &tracing::field::display(ip)); + } else if config.require_client_ip { + bail!("missing required client IP"); } + + socket + .inner + .set_nodelay(true) + .context("failed to set socket option")?; + + handle_client( + config, + &cancel_map, + session_id, + socket, + ClientMode::Tcp, + peer_addr.ip(), + endpoint_rate_limiter, + ) + .await } - _ = cancellation_token.cancelled() => { - drop(listener); - break; - } - } + .instrument(info_span!( + "handle_client", + ?session_id, + peer_addr = tracing::field::Empty + )) + .unwrap_or_else(move |e| { + // Acknowledge that the task has finished with an error. 
+ error!(?session_id, "per-client task finished with an error: {e:#}"); + }), + ); } + + connections.close(); + drop(listener); + // Drain connections - while let Some(res) = connections.join_next().await { - if let Err(e) = res { - if !e.is_panic() && !e.is_cancelled() { - warn!("unexpected error from joined connection task: {e:?}"); - } - } - } + connections.wait().await; + Ok(()) } @@ -408,7 +177,8 @@ pub async fn handle_client( session_id: uuid::Uuid, stream: S, mode: ClientMode, - peer_addr: SocketAddr, + peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { info!( protocol = mode.protocol_label(), @@ -416,16 +186,12 @@ pub async fn handle_client( ); let proto = mode.protocol_label(); - NUM_CLIENT_CONNECTION_OPENED_COUNTER + let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE .with_label_values(&[proto]) - .inc(); - NUM_CONNECTIONS_ACCEPTED_COUNTER + .guard(); + let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) - .inc(); - scopeguard::defer! { - NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - } + .guard(); let tls = config.tls_config.as_ref(); @@ -457,6 +223,7 @@ pub async fn handle_client( ¶ms, session_id, mode.allow_self_signed_compute(config), + endpoint_rate_limiter, ); cancel_map .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config)) @@ -548,297 +315,10 @@ async fn handshake( } } -/// If we couldn't connect, a cached connection info might be to blame -/// (e.g. the compute node's address might've changed at the wrong time). -/// Invalidate the cache entry (if any) to prevent subsequent errors. 
-#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { - let is_cached = node_info.cached(); - if is_cached { - warn!("invalidating stalled compute node info cache entry"); - } - let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", - }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - - node_info.invalidate().config -} - -/// Try to connect to the compute node once. -#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - node_info: &console::CachedNodeInfo, - timeout: time::Duration, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(allow_self_signed_compute, timeout) - .await -} - -#[async_trait] -pub trait ConnectMechanism { - type Connection; - type ConnectError; - type Error: From; - async fn connect_once( - &self, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result; - - fn update_connect_config(&self, conf: &mut compute::ConnCfg); -} - -pub struct TcpMechanism<'a> { - /// KV-dictionary with PostgreSQL connection params. 
- pub params: &'a StartupMessageParams, -} - -#[async_trait] -impl ConnectMechanism for TcpMechanism<'_> { - type Connection = PostgresConnection; - type ConnectError = compute::ConnectionError; - type Error = compute::ConnectionError; - - async fn connect_once( - &self, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once(node_info, timeout).await - } - - fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); - } -} - -const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" - } -} - -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - -/// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. 
-#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( - mechanism: &M, - mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, - mut latency_timer: LatencyTimer, -) -> Result -where - M::ConnectError: ShouldRetry + std::fmt::Debug, - M::Error: From, -{ - mechanism.update_connect_config(&mut node_info.config); - - // try once - let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => { - latency_timer.success(); - return Ok(res); - } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } - }; - - latency_timer.cache_miss(); - - let mut num_retries = 1; - - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match creds { - auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, - auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, - // nothing to do? 
- auth::BackendType::Link(_) => return Err(err.into()), - // test backend - auth::BackendType::Test(x) => x.wake_compute(), - }; - - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; - }; - - // now that we have a new node, try connect to it repeatedly. - // this can error for a few reasons, for instance: - // * DNS connection settings haven't quite propagated yet - info!("wake_compute success. attempting to connect"); - loop { - match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => { - latency_timer.success(); - return Ok(res); - } - Err(e) => { - let retriable = e.should_retry(num_retries); - if !retriable { - error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); - return Err(e.into()); - } - warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); - } - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; - } -} - -/// Attempts to wake up the compute node. 
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. - Ok(new) => Ok(ControlFlow::Break(new)), - } -} - -pub trait ShouldRetry { - fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { - match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, - err => err.could_retry(), - } - } -} - -impl ShouldRetry for io::Error { - fn could_retry(&self) -> bool { - use std::io::ErrorKind; - matches!( - self.kind(), - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut - ) - } -} - -impl ShouldRetry for tokio_postgres::error::DbError { - fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; - matches!( - self.code(), - &SqlState::CONNECTION_FAILURE - | &SqlState::CONNECTION_EXCEPTION - | &SqlState::CONNECTION_DOES_NOT_EXIST - | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) - } -} - -impl ShouldRetry for tokio_postgres::Error { - fn could_retry(&self) -> bool { - if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { - io::Error::could_retry(io_err) - } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) - } else { - false - } - } -} - -impl ShouldRetry for compute::ConnectionError { - fn could_retry(&self) -> bool { - match self { - compute::ConnectionError::Postgres(err) => err.could_retry(), - compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), - _ => false, - } - } -} - -pub fn retry_after(num_retries: u32) -> time::Duration { - 
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) -} - /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] async fn prepare_client_connection( node: &compute::PostgresConnection, - reported_auth_ok: bool, session: cancellation::Session<'_>, stream: &mut PqStream, ) -> anyhow::Result<()> { @@ -846,13 +326,6 @@ async fn prepare_client_connection( // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); - // Report authentication success if we haven't done this already. - // Note that we do this only (for the most part) after we've connected - // to a compute (see above) which performs its own authentication. - if !reported_auth_ok { - stream.write_message_noflush(&Be::AuthenticationOk)?; - } - // Forward all postgres connection params to the client. // Right now the implementation is very hacky and inefficent (ideally, // we don't need an intermediate hashmap), but at least it should be correct. @@ -877,11 +350,11 @@ async fn prepare_client_connection( pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, - aux: &MetricsAuxInfo, + aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.to_string(), - branch_id: aux.branch_id.to_string(), + endpoint_id: aux.endpoint_id.clone(), + branch_id: aux.branch_id.clone(), }); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); @@ -921,23 +394,26 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream>, /// Client credentials that we care about. - creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, + creds: auth::BackendType<'a, auth::ClientCredentials>, /// KV-dictionary with PostgreSQL connection params. 
params: &'a StartupMessageParams, /// Unique connection ID. session_id: uuid::Uuid, /// Allow self-signed certificates (for testing). allow_self_signed_compute: bool, + /// Rate limiter for endpoints + endpoint_rate_limiter: Arc, } impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream>, - creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, + creds: auth::BackendType<'a, auth::ClientCredentials>, params: &'a StartupMessageParams, session_id: uuid::Uuid, allow_self_signed_compute: bool, + endpoint_rate_limiter: Arc, ) -> Self { Self { stream, @@ -945,6 +421,7 @@ impl<'a, S> Client<'a, S> { params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } } } @@ -953,7 +430,7 @@ impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. // Instrumentation logs endpoint name everywhere. Doesn't work for link // auth; strictly speaking we don't know endpoint name in its case. - #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)] + #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)] async fn connect_to_db( self, session: cancellation::Session<'_>, @@ -962,22 +439,35 @@ impl Client<'_, S> { ) -> anyhow::Result<()> { let Self { mut stream, - mut creds, + creds, params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } = self; - let console_options = neon_options(params); + // check rate limit + if let Some(ep) = creds.get_endpoint() { + if !endpoint_rate_limiter.check(ep) { + return stream + .throw_error(auth::AuthError::too_many_connections()) + .await; + } + } + let proto = mode.protocol_label(); let extra = console::ConsoleReqExtra { session_id, // aka this connection's id - application_name: params.get("application_name"), - options: console_options.as_deref(), + application_name: format!( + "{}/{}", + params.get("application_name").unwrap_or_default(), + 
proto + ), + options: neon_options(params), }; + let mut latency_timer = LatencyTimer::new(proto); - let mut latency_timer = LatencyTimer::new(mode.protocol_label()); - + let user = creds.get_user().to_owned(); let auth_result = match creds .authenticate( &extra, @@ -990,7 +480,6 @@ impl Client<'_, S> { { Ok(auth_result) => auth_result, Err(e) => { - let user = creds.get_user(); let db = params.get("database"); let app = params.get("application_name"); let params_span = tracing::info_span!("", ?user, ?db, ?app); @@ -999,16 +488,13 @@ impl Client<'_, S> { } }; - let AuthSuccess { - reported_auth_ok, - value: mut node_info, - } = auth_result; + let (mut node_info, creds) = auth_result; node_info.allow_self_signed_compute = allow_self_signed_compute; let aux = node_info.aux.clone(); let mut node = connect_to_compute( - &TcpMechanism { params }, + &TcpMechanism { params, proto }, node_info, &extra, &creds, @@ -1017,45 +503,40 @@ impl Client<'_, S> { .or_else(|e| stream.throw_error(e)) .await?; - let proto = mode.protocol_label(); - NUM_DB_CONNECTIONS_OPENED_COUNTER - .with_label_values(&[proto]) - .inc(); - scopeguard::defer! { - NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - } - - prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; + prepare_client_connection(&node, session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. Normally there is none, but our serverless npm // driver in pipeline mode sends startup, password and first query // immediately after opening the connection. 
let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; - proxy_pass(stream, node.stream, &aux).await + proxy_pass(stream, node.stream, aux).await } } -pub fn neon_options(params: &StartupMessageParams) -> Option { +pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? - .filter(|opt| is_neon_param(opt)) - .sorted() // we sort it to use as cache key - .intersperse(" ") // TODO: use impl from std once it's stabilized - .collect(); - - // Don't even bother with empty options. - if options.is_empty() { - return None; + match params.options_raw() { + Some(options) => options.filter_map(neon_option).collect(), + None => vec![], } - - Some(options) } -pub fn is_neon_param(bytes: &str) -> bool { +pub fn neon_options_str(params: &StartupMessageParams) -> String { + #[allow(unstable_name_collisions)] + neon_options(params) + .iter() + .map(|(k, v)| format!("{}:{}", k, v)) + .sorted() // we sort it to use as cache key + .intersperse(" ".to_owned()) + .collect() +} + +pub fn neon_option(bytes: &str) -> Option<(String, String)> { static RE: OnceCell = OnceCell::new(); - RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap()); + let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); - RE.get().unwrap().is_match(bytes) + let cap = re.captures(bytes)?; + let (_, [k, v]) = cap.extract(); + Some((k.to_owned(), v.to_owned())) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs new file mode 100644 index 0000000000..88b0019c49 --- /dev/null +++ b/proxy/src/proxy/connect_compute.rs @@ -0,0 +1,238 @@ +use crate::{ + auth, + compute::{self, PostgresConnection}, + console::{self, errors::WakeComputeError, Api}, + metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, + proxy::retry::{retry_after, ShouldRetry}, +}; +use async_trait::async_trait; +use hyper::StatusCode; +use 
pq_proto::StartupMessageParams; +use std::ops::ControlFlow; +use tokio::time; +use tracing::{error, info, warn}; + +const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); + +/// If we couldn't connect, a cached connection info might be to blame +/// (e.g. the compute node's address might've changed at the wrong time). +/// Invalidate the cache entry (if any) to prevent subsequent errors. +#[tracing::instrument(name = "invalidate_cache", skip_all)] +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { + let is_cached = node_info.cached(); + if is_cached { + warn!("invalidating stalled compute node info cache entry"); + } + let label = match is_cached { + true => "compute_cached", + false => "compute_uncached", + }; + NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + + node_info.invalidate().config +} + +/// Try to connect to the compute node once. +#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] +async fn connect_to_compute_once( + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + proto: &'static str, +) -> Result { + let allow_self_signed_compute = node_info.allow_self_signed_compute; + + node_info + .config + .connect(allow_self_signed_compute, timeout, proto) + .await +} + +#[async_trait] +pub trait ConnectMechanism { + type Connection; + type ConnectError; + type Error: From; + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result; + + fn update_connect_config(&self, conf: &mut compute::ConnCfg); +} + +pub struct TcpMechanism<'a> { + /// KV-dictionary with PostgreSQL connection params. 
+ pub params: &'a StartupMessageParams, + pub proto: &'static str, +} + +#[async_trait] +impl ConnectMechanism for TcpMechanism<'_> { + type Connection = PostgresConnection; + type ConnectError = compute::ConnectionError; + type Error = compute::ConnectionError; + + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result { + connect_to_compute_once(node_info, timeout, self.proto).await + } + + fn update_connect_config(&self, config: &mut compute::ConnCfg) { + config.set_startup_params(self.params); + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let retry = bool_to_str(retry); + let kind = match e { + WakeComputeError::BadComputeAddress(_) => "bad_compute_address", + WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + ref text, + }) if text.contains("written data quota exceeded") + || text.contains("the limit for current plan reached") => + { + "quota_exceeded" + } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + .. + }) => "api_console_locked", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + }) => "api_console_bad_request", + WakeComputeError::ApiError(ApiError::Console { status, .. }) + if status.is_server_error() => + { + "api_console_other_server_error" + } + WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", + }; + NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); +} + +/// Try to connect to the compute node, retrying if necessary. +/// This function might update `node_info`, so we take it by `&mut`. 
+#[tracing::instrument(skip_all)] +pub async fn connect_to_compute( + mechanism: &M, + mut node_info: console::CachedNodeInfo, + extra: &console::ConsoleReqExtra, + creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + mut latency_timer: LatencyTimer, +) -> Result +where + M::ConnectError: ShouldRetry + std::fmt::Debug, + M::Error: From, +{ + mechanism.update_connect_config(&mut node_info.config); + + // try once + let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => { + latency_timer.success(); + return Ok(res); + } + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + (invalidate_cache(node_info), e) + } + }; + + latency_timer.cache_miss(); + + let mut num_retries = 1; + + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let node_info = loop { + let wake_res = match creds { + auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + #[cfg(feature = "testing")] + auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, + // nothing to do? 
+ auth::BackendType::Link(_) => return Err(err.into()), + // test backend + #[cfg(test)] + auth::BackendType::Test(x) => x.wake_compute(), + }; + + match handle_try_wake(wake_res, num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + return Err(e.into()); + } + // failed to wake up but we can continue to retry + Ok(ControlFlow::Continue(e)) => { + report_error(&e, true); + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + } + // successfully woke up a compute node and can break the wakeup loop + Ok(ControlFlow::Break(mut node_info)) => { + node_info.config.reuse_password(&config); + mechanism.update_connect_config(&mut node_info.config); + break node_info; + } + } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + }; + + // now that we have a new node, try connect to it repeatedly. + // this can error for a few reasons, for instance: + // * DNS connection settings haven't quite propagated yet + info!("wake_compute success. attempting to connect"); + loop { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => { + latency_timer.success(); + return Ok(res); + } + Err(e) => { + let retriable = e.should_retry(num_retries); + if !retriable { + error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + return Err(e.into()); + } + warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + } + } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + } +} + +/// Attempts to wake up the compute node. 
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. + Ok(new) => Ok(ControlFlow::Break(new)), + } +} diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs new file mode 100644 index 0000000000..a85ed380b0 --- /dev/null +++ b/proxy/src/proxy/retry.rs @@ -0,0 +1,68 @@ +use crate::compute; +use std::{error::Error, io}; +use tokio::time; + +/// Number of times we should retry the `/proxy_wake_compute` http request. +/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 +pub const NUM_RETRIES_CONNECT: u32 = 16; +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); +const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; + +pub trait ShouldRetry { + fn could_retry(&self) -> bool; + fn should_retry(&self, num_retries: u32) -> bool { + match self { + _ if num_retries >= NUM_RETRIES_CONNECT => false, + err => err.could_retry(), + } + } +} + +impl ShouldRetry for io::Error { + fn could_retry(&self) -> bool { + use std::io::ErrorKind; + matches!( + self.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut + ) + } +} + +impl ShouldRetry for tokio_postgres::error::DbError { + fn could_retry(&self) -> bool { + use tokio_postgres::error::SqlState; + matches!( + self.code(), + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) + } +} + +impl ShouldRetry for tokio_postgres::Error { + fn could_retry(&self) -> bool 
{ + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::could_retry(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::could_retry(db_err) + } else { + false + } + } +} + +impl ShouldRetry for compute::ConnectionError { + fn could_retry(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.could_retry(), + compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + _ => false, + } + } +} + +pub fn retry_after(num_retries: u32) -> time::Duration { + BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index b97c0efce4..3c483c59ee 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,11 +2,13 @@ mod mitm; +use super::connect_compute::ConnectMechanism; +use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::TestBackend; -use crate::auth::ClientCredentials; +use crate::auth::backend::{ComputeUserInfo, TestBackend}; use crate::config::CertResolver; use crate::console::{CachedNodeInfo, NodeInfo}; +use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; @@ -109,8 +111,9 @@ fn generate_tls_config<'a>( trait TestAuth: Sized { async fn authenticate( self, - _stream: &mut PqStream>, + stream: &mut PqStream>, ) -> anyhow::Result<()> { + stream.write_message_noflush(&Be::AuthenticationOk)?; Ok(()) } } @@ -168,7 +171,6 @@ async fn dummy_proxy( auth.authenticate(&mut stream).await?; stream - .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&Be::CLIENT_ENCODING)? 
.write_message(&Be::ReadyForQuery) .await?; @@ -424,7 +426,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, _node_info: &console::CachedNodeInfo, - _timeout: time::Duration, + _timeout: std::time::Duration, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; @@ -485,14 +487,14 @@ fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> ( CachedNodeInfo, - console::ConsoleReqExtra<'static>, - auth::BackendType<'_, ClientCredentials<'static>>, + console::ConsoleReqExtra, + auth::BackendType<'_, ComputeUserInfo>, ) { let cache = helper_create_cached_node_info(); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: Some("TEST"), - options: None, + application_name: "TEST".into(), + options: vec![], }; let creds = auth::BackendType::Test(mechanism); (cache, extra, creds) diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 50b3034936..a0a84a1dc0 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -120,7 +120,7 @@ where struct PgFrame; impl Decoder for PgFrame { type Item = Bytes; - type Error = io::Error; + type Error = std::io::Error; fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { if src.len() < 5 { @@ -136,7 +136,7 @@ impl Decoder for PgFrame { } } impl Encoder for PgFrame { - type Error = io::Error; + type Error = std::io::Error; fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> { dst.extend_from_slice(&item); diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 5622c44a68..b26386d159 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,3 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; +pub use limiter::{EndpointRateLimiter, RateBucketInfo}; diff --git 
a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs index c6c532ae53..2c14a54a6c 100644 --- a/proxy/src/rate_limiter/aimd.rs +++ b/proxy/src/rate_limiter/aimd.rs @@ -33,39 +33,6 @@ impl Aimd { min_utilisation_threshold: config.aimd_min_utilisation_threshold, } } - - pub fn decrease_factor(self, factor: f32) -> Self { - assert!((0.5..1.0).contains(&factor)); - Self { - decrease_factor: factor, - ..self - } - } - - pub fn increase_by(self, increase: usize) -> Self { - assert!(increase > 0); - Self { - increase_by: increase, - ..self - } - } - - pub fn with_max_limit(self, max: usize) -> Self { - assert!(max > 0); - Self { - max_limit: max, - ..self - } - } - - /// A threshold below which the limit won't be increased. 0.5 = 50%. - pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self { - assert!(min_util > 0. && min_util < 1.); - Self { - min_utilisation_threshold: min_util, - ..self - } - } } #[async_trait] diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3a9fed3919..a190b2cf8f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,13 +1,19 @@ use std::{ + collections::hash_map::RandomState, + hash::BuildHasher, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, + Arc, Mutex, }, - time::Duration, }; +use anyhow::bail; +use dashmap::DashMap; +use itertools::Itertools; +use rand::{rngs::StdRng, Rng, SeedableRng}; +use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Instant}; +use tokio::time::{timeout, Duration, Instant}; use tracing::info; use super::{ @@ -15,6 +21,180 @@ use super::{ RateLimiterConfig, }; +// Simple per-endpoint rate limiter. +// +// Check that number of connections to the endpoint is below `max_rps` rps. +// Purposefully ignore user name and database name as clients can reconnect +// with different names, so we'll end up sending some http requests to +// the control plane. 
+// +// We also may save quite a lot of CPU (I think) by bailing out right after we +// saw SNI, before doing TLS handshake. User-side error messages in that case +// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now +// I went with a more expensive way that yields user-friendlier error messages. +pub struct EndpointRateLimiter { + map: DashMap, Hasher>, + info: &'static [RateBucketInfo], + access_count: AtomicUsize, + rand: Mutex, +} + +#[derive(Clone, Copy)] +struct RateBucket { + start: Instant, + count: u32, +} + +impl RateBucket { + fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool { + if now - self.start < info.interval { + self.count < info.max_rpi + } else { + // bucket expired, reset + self.count = 0; + self.start = now; + + true + } + } + + fn inc(&mut self) { + self.count += 1; + } +} + +#[derive(Clone, Copy, PartialEq)] +pub struct RateBucketInfo { + pub interval: Duration, + // requests per interval + pub max_rpi: u32, +} + +impl std::fmt::Display for RateBucketInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32; + write!(f, "{rps}@{}", humantime::format_duration(self.interval)) + } +} + +impl std::fmt::Debug for RateBucketInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self}") + } +} + +impl std::str::FromStr for RateBucketInfo { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let Some((max_rps, interval)) = s.split_once('@') else { + bail!("invalid rate info") + }; + let max_rps = max_rps.parse()?; + let interval = humantime::parse_duration(interval)?; + Ok(Self::new(max_rps, interval)) + } +} + +impl RateBucketInfo { + pub const DEFAULT_SET: [Self; 3] = [ + Self::new(300, Duration::from_secs(1)), + Self::new(200, Duration::from_secs(60)), + Self::new(100, Duration::from_secs(600)), + ]; + + pub fn validate(info: &mut [Self]) -> 
anyhow::Result<()> { + info.sort_unstable_by_key(|info| info.interval); + let invalid = info + .iter() + .tuple_windows() + .find(|(a, b)| a.max_rpi > b.max_rpi); + if let Some((a, b)) = invalid { + bail!( + "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", + b.max_rpi, + a.max_rpi, + ); + } + + Ok(()) + } + + pub const fn new(max_rps: u32, interval: Duration) -> Self { + Self { + interval, + max_rpi: max_rps * interval.as_millis() as u32 / 1000, + } + } +} + +impl EndpointRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) + } +} + +impl EndpointRateLimiter { + fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { + info!(buckets = ?info, "endpoint rate limiter"); + Self { + info, + map: DashMap::with_hasher_and_shard_amount(hasher, 64), + access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request + rand: Mutex::new(rand), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub fn check(&self, endpoint: SmolStr) -> bool { + // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. + // worst case memory usage is about: + // = 2 * 2048 * 64 * (48B + 72B) + // = 30MB + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(); + } + + let now = Instant::now(); + let mut entry = self.map.entry(endpoint).or_insert_with(|| { + vec![ + RateBucket { + start: now, + count: 0, + }; + self.info.len() + ] + }); + + let should_allow_request = entry + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + entry.iter_mut().for_each(RateBucket::inc); + } + + should_allow_request + } + + /// Clean the map. 
Simple strategy: remove all entries in a random shard. + /// At worst, we'll double the effective max_rps during the cleanup. + /// But that way deletion does not acquire mutex on each entry access. + pub fn do_gc(&self) { + info!( + "cleaning up endpoint rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide + // (impossible, in fact, unless we have 2048 threads) + let shard = self.rand.lock().unwrap().gen_range(0..n); + self.map.shards()[shard].write().clear(); + } +} + /// Limits the number of concurrent jobs. /// /// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the @@ -53,7 +233,6 @@ pub struct Token<'t> { #[derive(Debug, Clone, Copy)] pub struct LimiterState { limit: usize, - available: usize, in_flight: usize, } @@ -214,10 +393,10 @@ impl Limiter { } new_limit }; - crate::proxy::RATE_LIMITER_LIMIT + crate::metrics::RATE_LIMITER_LIMIT .with_label_values(&["expected"]) .set(new_limit as i64); - crate::proxy::RATE_LIMITER_LIMIT + crate::metrics::RATE_LIMITER_LIMIT .with_label_values(&["actual"]) .set(actual_limit as i64); self.limits.store(new_limit, Ordering::Release); @@ -231,11 +410,7 @@ impl Limiter { pub fn state(&self) -> LimiterState { let limit = self.limits.load(Ordering::Relaxed); let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { - limit, - available: limit.saturating_sub(in_flight), - in_flight, - } + LimiterState { limit, in_flight } } } @@ -248,13 +423,6 @@ impl<'t> Token<'t> { } } - #[cfg(test)] - pub fn set_latency(&mut self, latency: Duration) { - use std::ops::Sub; - - self.start = Instant::now().sub(latency); - } - pub fn forget(&mut self) { if let Some(permit) = self.permit.take() { permit.forget(); @@ -273,10 +441,6 @@ impl LimiterState { pub fn limit(&self) -> usize { self.limit } - /// The amount of concurrency available to use.
- pub fn available(&self) -> usize { - self.available - } /// The number of jobs in flight. pub fn in_flight(&self) -> usize { self.in_flight @@ -306,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter { ) })?; info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) @@ -324,12 +488,16 @@ impl reqwest_middleware::Middleware for Limiter { #[cfg(test)] mod tests { - use std::{pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; use futures::{task::noop_waker_ref, Future}; + use rand::SeedableRng; + use rustc_hash::FxHasher; + use smol_str::SmolStr; + use tokio::time; - use super::{Limiter, Outcome}; - use crate::rate_limiter::RateLimitAlgorithm; + use super::{EndpointRateLimiter, Limiter, Outcome}; + use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm}; #[tokio::test] async fn it_works() { @@ -438,4 +606,105 @@ mod tests { limiter.release(token1, None).await; limiter.release(token2, None).await; } + + #[test] + fn rate_bucket_rpi() { + let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5)); + assert_eq!(rate_bucket.max_rpi, 50 * 5); + + let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500)); + assert_eq!(rate_bucket.max_rpi, 50 / 2); + } + + #[test] + fn rate_bucket_parse() { + let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap(); + assert_eq!(rate_bucket.interval, Duration::from_secs(10)); + assert_eq!(rate_bucket.max_rpi, 100 * 10); + assert_eq!(rate_bucket.to_string(), "100@10s"); + + let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap(); + assert_eq!(rate_bucket.interval, Duration::from_secs(60)); + 
assert_eq!(rate_bucket.max_rpi, 100 * 60); + assert_eq!(rate_bucket.to_string(), "100@1m"); + } + + #[test] + fn default_rate_buckets() { + let mut defaults = RateBucketInfo::DEFAULT_SET; + RateBucketInfo::validate(&mut defaults[..]).unwrap(); + } + + #[test] + #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] + fn rate_buckets_validate() { + let mut rates: Vec = ["300@1s", "10@10s"] + .into_iter() + .map(|s| s.parse().unwrap()) + .collect(); + RateBucketInfo::validate(&mut rates).unwrap(); + } + + #[tokio::test] + async fn test_rate_limits() { + let mut rates: Vec = ["100@1s", "20@30s"] + .into_iter() + .map(|s| s.parse().unwrap()) + .collect(); + RateBucketInfo::validate(&mut rates).unwrap(); + let limiter = EndpointRateLimiter::new(Vec::leak(rates)); + + let endpoint = SmolStr::from("ep-my-endpoint-1234"); + + time::pause(); + + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + // more connections fail + assert!(!limiter.check(endpoint.clone())); + + // fail even after 500ms as it's in the same bucket + time::advance(time::Duration::from_millis(500)).await; + assert!(!limiter.check(endpoint.clone())); + + // after a full 1s, 100 requests are allowed again + time::advance(time::Duration::from_millis(500)).await; + for _ in 1..6 { + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + time::advance(time::Duration::from_millis(1000)).await; + } + + // more connections after 600 will exceed the 20rps@30s limit + assert!(!limiter.check(endpoint.clone())); + + // will still fail before the 30 second limit + time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; + assert!(!limiter.check(endpoint.clone())); + + // after the full 30 seconds, 100 requests are allowed again + time::advance(time::Duration::from_millis(1)).await; + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + } + + #[tokio::test] + async fn test_rate_limits_gc() { + // fixed 
seeded random/hasher to ensure that the test is not flaky + let rand = rand::rngs::StdRng::from_seed([1; 32]); + let hasher = BuildHasherDefault::::default(); + + let limiter = EndpointRateLimiter::new_with_rand_and_hasher( + &RateBucketInfo::DEFAULT_SET, + rand, + hasher, + ); + for i in 0..1_000_000 { + limiter.check(format!("{i}").into()); + } + assert!(limiter.map.len() < 150_000); + } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 6d1dd9fba5..da1cf21c6a 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -30,6 +30,9 @@ pub enum Error { #[error("Bad client message: {0}")] BadClientMessage(&'static str), + #[error("Internal error: missing digest")] + MissingBinding, + #[error(transparent)] Io(#[from] io::Error), } @@ -38,8 +41,7 @@ impl UserFacingError for Error { fn to_string_client(&self) -> String { use Error::*; match self { - // TODO: add support for channel binding - ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), + ChannelBindingFailed(m) => m.to_string(), ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), _ => "authentication protocol violation".to_string(), } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 63271309e1..49a7a13043 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -15,7 +15,7 @@ mod signature; #[cfg(any(test, doc))] mod password; -pub use exchange::Exchange; +pub use exchange::{exchange, Exchange}; pub use key::ScramKey; pub use secret::ServerSecret; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 319d9b1014..9af7db5201 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -1,5 +1,9 @@ //! Implementation of the SCRAM authentication algorithm. 
+use std::convert::Infallible; + +use postgres_protocol::authentication::sasl::ScramSha256; + use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; @@ -29,22 +33,27 @@ impl std::str::FromStr for TlsServerEndPoint { } } +struct SaslSentInner { + cbind_flag: ChannelBinding, + client_first_message_bare: String, + server_first_message: OwnedServerFirstMessage, +} + +struct SaslInitial { + nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], +} + enum ExchangeState { /// Waiting for [`ClientFirstMessage`]. - Initial, + Initial(SaslInitial), /// Waiting for [`ClientFinalMessage`]. - SaltSent { - cbind_flag: ChannelBinding, - client_first_message_bare: String, - server_first_message: OwnedServerFirstMessage, - }, + SaltSent(SaslSentInner), } /// Server's side of SCRAM auth algorithm. pub struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, - nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], tls_server_end_point: config::TlsServerEndPoint, } @@ -55,90 +64,160 @@ impl<'a> Exchange<'a> { tls_server_end_point: config::TlsServerEndPoint, ) -> Self { Self { - state: ExchangeState::Initial, + state: ExchangeState::Initial(SaslInitial { nonce }), secret, - nonce, tls_server_end_point, } } } +pub fn exchange( + secret: &ServerSecret, + mut client: ScramSha256, + tls_server_end_point: config::TlsServerEndPoint, +) -> sasl::Result> { + use sasl::Step::*; + + let init = SaslInitial { + nonce: rand::random, + }; + + let client_first = std::str::from_utf8(client.message()) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + let sent = match init.transition(secret, &tls_server_end_point, client_first)? 
{ + Continue(sent, server_first) => { + client.update(server_first.as_bytes())?; + sent + } + Success(x, _) => match x {}, + Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), + }; + + let client_final = std::str::from_utf8(client.message()) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + let keys = match sent.transition(secret, &tls_server_end_point, client_final)? { + Success(keys, server_final) => { + client.finish(server_final.as_bytes())?; + keys + } + Continue(x, _) => match x {}, + Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), + }; + + Ok(sasl::Outcome::Success(keys)) +} + +impl SaslInitial { + fn transition( + &self, + secret: &ServerSecret, + tls_server_end_point: &config::TlsServerEndPoint, + input: &str, + ) -> sasl::Result> { + let client_first_message = ClientFirstMessage::parse(input) + .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?; + + // If the flag is set to "y" and the server supports channel + // binding, the server MUST fail authentication + if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer + && tls_server_end_point.supported() + { + return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used")); + } + + let server_first_message = client_first_message.build_server_first_message( + &(self.nonce)(), + &secret.salt_base64, + secret.iterations, + ); + let msg = server_first_message.as_str().to_owned(); + + let next = SaslSentInner { + cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?, + client_first_message_bare: client_first_message.bare.to_owned(), + server_first_message, + }; + + Ok(sasl::Step::Continue(next, msg)) + } +} + +impl SaslSentInner { + fn transition( + &self, + secret: &ServerSecret, + tls_server_end_point: &config::TlsServerEndPoint, + input: &str, + ) -> sasl::Result> { + let Self { + cbind_flag, + client_first_message_bare, + server_first_message, + } = self; + + let client_final_message = ClientFinalMessage::parse(input) + 
.ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; + + let channel_binding = cbind_flag.encode(|_| match tls_server_end_point { + config::TlsServerEndPoint::Sha256(x) => Ok(x), + config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), + })?; + + // This might've been caused by a MITM attack + if client_final_message.channel_binding != channel_binding { + return Err(SaslError::ChannelBindingFailed( + "insecure connection: secure channel data mismatch", + )); + } + + if client_final_message.nonce != server_first_message.nonce() { + return Err(SaslError::BadClientMessage("combined nonce doesn't match")); + } + + let signature_builder = SignatureBuilder { + client_first_message_bare, + server_first_message: server_first_message.as_str(), + client_final_message_without_proof: client_final_message.without_proof, + }; + + let client_key = signature_builder + .build(&secret.stored_key) + .derive_client_key(&client_final_message.proof); + + // Auth fails either if keys don't match or it's pre-determined to fail. 
+ if client_key.sha256() != secret.stored_key || secret.doomed { + return Ok(sasl::Step::Failure("password doesn't match")); + } + + let msg = + client_final_message.build_server_final_message(signature_builder, &secret.server_key); + + Ok(sasl::Step::Success(client_key, msg)) + } +} + impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { use {sasl::Step::*, ExchangeState::*}; match &self.state { - Initial => { - let client_first_message = ClientFirstMessage::parse(input) - .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?; - - // If the flag is set to "y" and the server supports channel - // binding, the server MUST fail authentication - if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer - && self.tls_server_end_point.supported() - { - return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used")); - } - - let server_first_message = client_first_message.build_server_first_message( - &(self.nonce)(), - &self.secret.salt_base64, - self.secret.iterations, - ); - let msg = server_first_message.as_str().to_owned(); - - self.state = SaltSent { - cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?, - client_first_message_bare: client_first_message.bare.to_owned(), - server_first_message, - }; - - Ok(Continue(self, msg)) - } - SaltSent { - cbind_flag, - client_first_message_bare, - server_first_message, - } => { - let client_final_message = ClientFinalMessage::parse(input) - .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; - - let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point { - config::TlsServerEndPoint::Sha256(x) => Ok(x), - config::TlsServerEndPoint::Undefined => { - Err(SaslError::ChannelBindingFailed("no cert digest provided")) + Initial(init) => { + match init.transition(self.secret, &self.tls_server_end_point, input)? 
{ + Continue(sent, msg) => { + self.state = SaltSent(sent); + Ok(Continue(self, msg)) } - })?; - - // This might've been caused by a MITM attack - if client_final_message.channel_binding != channel_binding { - return Err(SaslError::ChannelBindingFailed("data mismatch")); + Success(x, _) => match x {}, + Failure(msg) => Ok(Failure(msg)), } - - if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::BadClientMessage("combined nonce doesn't match")); + } + SaltSent(sent) => { + match sent.transition(self.secret, &self.tls_server_end_point, input)? { + Success(keys, msg) => Ok(Success(keys, msg)), + Continue(x, _) => match x {}, + Failure(msg) => Ok(Failure(msg)), } - - let signature_builder = SignatureBuilder { - client_first_message_bare, - server_first_message: server_first_message.as_str(), - client_final_message_without_proof: client_final_message.without_proof, - }; - - let client_key = signature_builder - .build(&self.secret.stored_key) - .derive_client_key(&client_final_message.proof); - - // Auth fails either if keys don't match or it's pre-determined to fail. - if client_key.sha256() != self.secret.stored_key || self.secret.doomed { - return Ok(Failure("password doesn't match")); - } - - let msg = client_final_message - .build_server_final_message(signature_builder, &self.secret.server_key); - - Ok(Success(client_key, msg)) } } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index e9c65fcef3..bd93fb2b70 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 424beccec9..9e74e07af1 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,6 +5,7 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. +#[derive(Clone)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 45f8132393..07825da8dc 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,13 +6,20 @@ mod conn_pool; mod sql_over_http; mod websocket; +pub use conn_pool::GlobalConnPoolOptions; + use anyhow::bail; use hyper::StatusCode; +use metrics::IntCounterPairGuard; +use rand::rngs::StdRng; +use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio_util::task::TaskTracker; +use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; -use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER}; +use crate::rate_limiter::EndpointRateLimiter; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; use hyper::{ @@ -23,7 +30,7 @@ use hyper::{ Body, Method, Request, Response, }; -use std::net::SocketAddr; +use std::net::IpAddr; use std::task::Poll; use std::{future::ready, sync::Arc}; use tls_listener::TlsListener; @@ -36,6 +43,7 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); @@ -43,6 +51,11 @@ pub async fn task_main( let conn_pool = conn_pool::GlobalConnPool::new(config); + let conn_pool2 = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool2.gc_worker(StdRng::from_entropy()).await; + }); + // shutdown the connection pool tokio::spawn({ let cancellation_token = cancellation_token.clone(); @@ -70,6 +83,9 @@ pub async fn task_main( incoming: addr_incoming, }; + let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); + ws_connections.close(); // allows `ws_connections.wait` to complete + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { error!("failed to accept TLS connection for websockets: {err:?}"); @@ -86,6 +102,8 @@ pub async fn task_main( let remote_addr = io.inner.remote_addr(); let sni_name = tls.server_name().map(|s| s.to_string()); let conn_pool = conn_pool.clone(); + let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let peer_addr = match client_addr { @@ -97,13 +115,23 @@ pub async fn task_main( move |req: Request| { let sni_name = sni_name.clone(); let conn_pool = conn_pool.clone(); + let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); request_handler( - req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr, + req, + config, + conn_pool, + ws_connections, + cancel_map, + session_id, + sni_name, + peer_addr.ip(), + endpoint_rate_limiter, ) .instrument(info_span!( "serverless", @@ -123,27 +151,25 @@ pub async fn task_main( .with_graceful_shutdown(cancellation_token.cancelled()) .await?; + // await websocket connections + ws_connections.wait().await; + Ok(()) } struct MetricService { inner: S, + _gauge: IntCounterPairGuard, } impl MetricService { fn new(inner: S) -> 
MetricService { - NUM_CLIENT_CONNECTION_OPENED_COUNTER - .with_label_values(&["http"]) - .inc(); - MetricService { inner } - } -} - -impl Drop for MetricService { - fn drop(&mut self) { - NUM_CLIENT_CONNECTION_CLOSED_COUNTER - .with_label_values(&["http"]) - .inc(); + MetricService { + inner, + _gauge: NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["http"]) + .guard(), + } } } @@ -164,14 +190,17 @@ where } } +#[allow(clippy::too_many_arguments)] async fn request_handler( mut request: Request, config: &'static ProxyConfig, conn_pool: Arc, + ws_connections: TaskTracker, cancel_map: Arc, session_id: uuid::Uuid, sni_hostname: Option, - peer_addr: SocketAddr, + peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> Result, ApiError> { let host = request .headers() @@ -187,7 +216,7 @@ async fn request_handler( let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; - tokio::spawn( + ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( websocket, @@ -196,6 +225,7 @@ async fn request_handler( session_id, host, peer_addr, + endpoint_rate_limiter, ) .await { @@ -223,7 +253,7 @@ async fn request_handler( .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", - "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In", + "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2072cadc3a..c476560215 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,14 +1,19 @@ -use anyhow::Context; +use anyhow::{anyhow, Context}; use 
async_trait::async_trait; use dashmap::DashMap; -use futures::future::poll_fn; +use futures::{future::poll_fn, Future}; +use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; +use once_cell::sync::Lazy; use parking_lot::RwLock; use pbkdf2::{ password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, Params, Pbkdf2, }; use pq_proto::StartupMessageParams; -use std::{collections::HashMap, net::SocketAddr, sync::Arc}; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use rand::Rng; +use smol_str::SmolStr; +use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, task::{ready, Poll}, @@ -17,40 +22,35 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ - auth::{self, check_peer_addr_is_in_list}, + auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, console, - proxy::{ - neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, - NUM_DB_CONNECTIONS_OPENED_COUNTER, - }, + metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE}, + proxy::{connect_compute::ConnectMechanism, neon_options}, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; -use crate::proxy::ConnectMechanism; - -use tracing::{error, warn, Span}; +use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: &str = "sql_over_http"; -const MAX_CONNS_PER_ENDPOINT: usize = 20; +pub const APP_NAME: &str = "/sql_over_http"; #[derive(Debug, Clone)] pub struct ConnInfo { - pub username: String, - pub dbname: String, - pub hostname: String, - pub password: String, - pub options: Option, + pub username: SmolStr, + pub dbname: SmolStr, + pub hostname: SmolStr, + pub password: SmolStr, + pub options: Option, } impl ConnInfo { // hm, change to hasher to avoid cloning? 
- pub fn db_and_user(&self) -> (String, String) { + pub fn db_and_user(&self) -> (SmolStr, SmolStr) { (self.dbname.clone(), self.username.clone()) } } @@ -70,8 +70,79 @@ struct ConnPoolEntry { // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub struct EndpointConnPool { - pools: HashMap<(String, String), DbUserConnPool>, + pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, + max_conns: usize, + _guard: IntCounterPairGuard, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + let Self { + pools, total_conns, .. + } = self; + pools + .get_mut(&db_user) + .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + } + + fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + let Self { + pools, total_conns, .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.conn_id != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + let conn_id = client.conn_id; + + if client.inner.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return Ok(()); + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + // we create this db-user entry in get, so it should not be None + if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + 
pool.total_conns += 1; + } + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } } /// 4096 is the number of rounds that SCRAM-SHA-256 recommends. @@ -90,75 +161,167 @@ pub struct DbUserConnPool { password_hash: Option, } +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.inner.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + } + + fn get_conn_entry(&mut self, conns: &mut usize) -> Option { + self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + } + conn + } +} + pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>, + /// Number of endpoint-connection pools + /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. global_pool_size: AtomicUsize, + proxy_config: &'static crate::config::ProxyConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { // Maximum number of connections per one endpoint. // Can mix different (dbname, username) connections. // When running out of free slots for a particular endpoint, // falls back to opening a new connection for each request. 
- max_conns_per_endpoint: usize, + pub max_conns_per_endpoint: usize, - proxy_config: &'static crate::config::ProxyConfig, + pub gc_epoch: Duration, - // Using a lock to remove any race conditions. - // Eg cleaning up connections while a new connection is returned - closed: RwLock, + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, } +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + "proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + impl GlobalConnPool { pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + let shards = config.http_config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::new(), + global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, proxy_config: config, - closed: RwLock::new(false), }) } pub fn shutdown(&self) { - *self.closed.write() = true; + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } - self.global_pool.retain(|_, endpoint_pool| { - let mut pool = endpoint_pool.write(); - // by clearing this hashmap, we remove the slots that a connection can be returned to. 
- // when returning, it drops the connection if the slot doesn't exist - pool.pools.clear(); - pool.total_conns = 0; + pub async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; - false + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = GC_LATENCY.start_timer(); + let current_len = shard.len(); + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + pools + .iter_mut() + .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true }); + let new_len = shard.len(); + drop(shard); + timer.observe_duration(); + + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } } pub async fn get( self: &Arc, - conn_info: &ConnInfo, + conn_info: ConnInfo, force_new: bool, session_id: uuid::Uuid, - peer_addr: SocketAddr, + peer_addr: IpAddr, ) -> anyhow::Result { let mut client: Option = None; let mut latency_timer = LatencyTimer::new("http"); - let pool = if force_new { - None - } else { - Some((conn_info.clone(), self.clone())) - }; - let mut hash_valid = false; + let mut endpoint_pool = Weak::new(); if !force_new { let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); + endpoint_pool = Arc::downgrade(&pool); let mut hash = None; // find a pool entry by (dbname, username) if exists @@ -183,12 +346,8 @@ impl GlobalConnPool { // we will continue with the regular connection flow if validate.is_ok() { hash_valid = true; - let mut pool = pool.write(); - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - if let Some(entry) = pool_entries.conns.pop() { - client = Some(entry.conn); - pool.total_conns -= 1; - } + if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { + client = Some(entry.conn) } } } @@ -201,11 +360,12 @@ impl GlobalConnPool { info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await } else { @@ -217,18 +377,19 @@ impl GlobalConnPool { ); latency_timer.pool_hit(); latency_timer.success(); - return Ok(Client::new(client, pool).await); + return Ok(Client::new(client, conn_info, endpoint_pool).await); } } else { let conn_id = uuid::Uuid::new_v4(); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await }; @@ -272,62 +433,10 @@ impl GlobalConnPool { _ => {} } let new_client = new_client?; - 
Ok(Client::new(new_client, pool).await) + Ok(Client::new(new_client, conn_info, endpoint_pool).await) } - fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { - let conn_id = client.conn_id; - - // We want to hold this open while we return. This ensures that the pool can't close - // while we are in the middle of returning the connection. - let closed = self.closed.read(); - if *closed { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed"); - return Ok(()); - } - - if client.inner.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); - } - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < self.max_conns_per_endpoint { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - } - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - - Ok(()) - } - - fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc> { + fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -337,6 +446,12 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { 
pools: HashMap::new(), total_conns: 0, + max_conns: self + .proxy_config + .http_config + .pool_options + .max_conns_per_endpoint, + _guard: ENDPOINT_POOLS.guard(), })); // find or create a pool for this endpoint @@ -366,9 +481,11 @@ impl GlobalConnPool { } struct TokioMechanism<'a> { + pool: Weak>, conn_info: &'a ConnInfo, session_id: uuid::Uuid, conn_id: uuid::Uuid, + idle: Duration, } #[async_trait] @@ -388,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> { timeout, self.conn_id, self.session_id, + self.pool.clone(), + self.idle, ) .await } @@ -405,7 +524,8 @@ async fn connect_to_compute( conn_id: uuid::Uuid, session_id: uuid::Uuid, latency_timer: LatencyTimer, - peer_addr: SocketAddr, + peer_addr: IpAddr, + pool: Weak>, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -422,19 +542,21 @@ async fn connect_to_compute( common_names, peer_addr, )?; + + let creds = + ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?; let backend = config.auth_backend.as_ref().map(|_| creds); let console_options = neon_options(¶ms); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: Some(APP_NAME), - options: console_options.as_deref(), + application_name: APP_NAME.to_string(), + options: console_options, }; - // TODO(anna): this is a bit hacky way, consider using console notification listener. if !config.disable_ip_check_for_http { let allowed_ips = backend.get_allowed_ips(&extra).await?; - if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) { + if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed().into()); } } @@ -443,11 +565,13 @@ async fn connect_to_compute( .await? 
.context("missing cache entry from wake_compute")?; - crate::proxy::connect_to_compute( + crate::proxy::connect_compute::connect_to_compute( &TokioMechanism { conn_id, conn_info, session_id, + pool, + idle: config.http_config.pool_options.idle_timeout, }, node_info, &extra, @@ -463,16 +587,23 @@ async fn connect_to_compute_once( timeout: time::Duration, conn_id: uuid::Uuid, mut session: uuid::Uuid, + pool: Weak>, + idle: Duration, ) -> Result { let mut config = (*node_info.config).clone(); let (client, mut connection) = config .user(&conn_info.username) - .password(&conn_info.password) + .password(&*conn_info.password) .dbname(&conn_info.dbname) .connect_timeout(timeout) .connect(tokio_postgres::NoTls) .await?; + + let conn_gauge = NUM_DB_CONNECTIONS_GAUGE + .with_label_values(&["http"]) + .guard(); + tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let (tx, mut rx) = tokio::sync::watch::channel(session); @@ -482,20 +613,33 @@ async fn connect_to_compute_once( info!(%conn_info, %session, "new connection"); }); let ids = Ids { - endpoint_id: node_info.aux.endpoint_id.to_string(), - branch_id: node_info.aux.branch_id.to_string(), + endpoint_id: node_info.aux.endpoint_id.clone(), + branch_id: node_info.aux.branch_id.clone(), }; + let db_user = conn_info.db_and_user(); tokio::spawn( async move { - NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc(); - scopeguard::defer! 
{ - NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc(); - } + let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); info!(%session, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); + if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. + // does nothing if the client is currently checked-out and in-use + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("idle connection removed"); + } + } } loop { @@ -513,15 +657,25 @@ async fn connect_to_compute_once( } Some(Err(e)) => { error!(%session, "connection error: {}", e); - return Poll::Ready(()) + break } None => { info!("connection closed"); - return Poll::Ready(()) + break } } } - }).await + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + } .instrument(span) ); @@ -551,23 +705,27 @@ pub struct Client { conn_id: uuid::Uuid, span: Span, inner: Option, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, } pub struct Discard<'a> { conn_id: uuid::Uuid, - pool: &'a mut Option<(ConnInfo, Arc)>, + conn_info: &'a ConnInfo, + pool: &'a mut Weak>, } impl Client { pub(self) async fn new( inner: ClientInner, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, ) -> Self { Self { conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), + conn_info, pool, } } @@ -576,6 +734,7 @@ impl Client { inner, pool, conn_id, + conn_info, span: _, } = self; ( @@ -585,6 +744,7 @@ impl Client { .inner, Discard 
{ pool, + conn_info, conn_id: *conn_id, }, ) @@ -600,14 +760,14 @@ impl Client { impl Discard<'_> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - if status != ReadyForQueryStatus::Idle { - if let Some((conn_info, _)) = self.pool.take() { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") - } + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { - if let Some((conn_info, _)) = self.pool.take() { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } @@ -627,16 +787,17 @@ impl Deref for Client { impl Drop for Client { fn drop(&mut self) { + let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); - if let Some((conn_info, conn_pool)) = self.pool.take() { + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool tokio::task::spawn_blocking(move || { let _span = current_span.enter(); - let _ = conn_pool.put(&conn_info, client); + let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 25b96668de..2e9d8526d3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,4 +1,4 @@ -use std::net::SocketAddr; +use std::net::IpAddr; use std::sync::Arc; use anyhow::bail; @@ -29,7 +29,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use crate::config::HttpConfig; -use 
crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER}; +use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; @@ -182,16 +182,16 @@ fn get_conn_info( for (key, value) in pairs { if key == "options" { - options = Some(value.to_string()); + options = Some(value.into()); break; } } Ok(ConnInfo { - username: username.to_owned(), - dbname: dbname.to_owned(), - hostname: hostname.to_owned(), - password: password.to_owned(), + username: username.into(), + dbname: dbname.into(), + hostname: hostname.into(), + password: password.into(), options, }) } @@ -202,11 +202,11 @@ pub async fn handle( sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, - peer_addr: SocketAddr, + peer_addr: IpAddr, config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.timeout, + config.request_timeout, handle_inner( config, request, @@ -278,7 +278,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.timeout.as_secs() + config.request_timeout.as_secs() ); error!(message); json_response( @@ -301,14 +301,11 @@ async fn handle_inner( sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, - peer_addr: SocketAddr, + peer_addr: IpAddr, ) -> anyhow::Result> { - NUM_CONNECTIONS_ACCEPTED_COUNTER + let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&["http"]) - .inc(); - scopeguard::defer! 
{ - NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc(); - } + .guard(); // // Determine the destination and connection params @@ -323,7 +320,8 @@ async fn handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = + !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -362,7 +360,7 @@ async fn handle_inner( let payload: Payload = serde_json::from_slice(&body)?; let mut client = conn_pool - .get(&conn_info, !allow_pool, session_id, peer_addr) + .get(conn_info, !allow_pool, session_id, peer_addr) .await?; let mut response = Response::builder() diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 8fb9a3dee4..071add3bca 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, error::io_error, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -11,8 +12,9 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; use std::{ - net::SocketAddr, + net::IpAddr, pin::Pin, + sync::Arc, task::{ready, Context, Poll}, }; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; @@ -25,15 +27,15 @@ use sync_wrapper::SyncWrapper; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
- pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, + stream: SyncWrapper>, bytes: Bytes, } } -impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { +impl WebSocketRw { + pub fn new(stream: WebSocketStream) -> Self { Self { stream: stream.into(), bytes: Bytes::new(), @@ -41,7 +43,7 @@ impl WebSocketRw { } } -impl AsyncWrite for WebSocketRw { +impl AsyncWrite for WebSocketRw { fn poll_write( self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -67,7 +69,7 @@ impl AsyncWrite for WebSocketRw { } } -impl AsyncRead for WebSocketRw { +impl AsyncRead for WebSocketRw { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -84,7 +86,7 @@ impl AsyncRead for WebSocketRw { } } -impl AsyncBufRead for WebSocketRw { +impl AsyncBufRead for WebSocketRw { fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { // Please refer to poll_fill_buf's documentation. const EOF: Poll> = Poll::Ready(Ok(&[])); @@ -133,7 +135,8 @@ pub async fn serve_websocket( cancel_map: &CancelMap, session_id: uuid::Uuid, hostname: Option, - peer_addr: SocketAddr, + peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; handle_client( @@ -143,7 +146,65 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, peer_addr, + endpoint_rate_limiter, ) .await?; Ok(()) } + +#[cfg(test)] +mod tests { + use std::pin::pin; + + use futures::{SinkExt, StreamExt}; + use hyper_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; + use tokio::{ + io::{duplex, AsyncReadExt, AsyncWriteExt}, + task::JoinSet, + }; + + use super::WebSocketRw; + + #[tokio::test] + async fn websocket_stream_wrapper_happy_path() { + let (stream1, stream2) = duplex(1024); + + let mut js = JoinSet::new(); + + js.spawn(async move { + let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await; + + client + .send(Message::Binary(b"hello 
world".to_vec())) + .await + .unwrap(); + + let message = client.next().await.unwrap().unwrap(); + assert_eq!(message, Message::Binary(b"websockets are cool".to_vec())); + + client.close(None).await.unwrap(); + }); + + js.spawn(async move { + let mut rw = pin!(WebSocketRw::new( + WebSocketStream::from_raw_socket(stream2, Role::Server, None).await + )); + + let mut buf = vec![0; 1024]; + let n = rw.read(&mut buf).await.unwrap(); + assert_eq!(&buf[..n], b"hello world"); + + rw.write_all(b"websockets are cool").await.unwrap(); + rw.flush().await.unwrap(); + + let n = rw.read_to_end(&mut buf).await.unwrap(); + assert_eq!(n, 0); + }); + + js.join_next().await.unwrap().unwrap(); + js.join_next().await.unwrap().unwrap(); + } +} diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 180b5f7199..789a4c680c 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -6,6 +6,7 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S use dashmap::{mapref::entry::Entry, DashMap}; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use std::{ convert::Infallible, sync::{ @@ -29,8 +30,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. 
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: String, - pub branch_id: String, + pub endpoint_id: SmolStr, + pub branch_id: SmolStr, } #[derive(Debug)] @@ -290,8 +291,8 @@ mod tests { // register a new counter let counter = metrics.register(Ids { - endpoint_id: "e1".to_string(), - branch_id: "b1".to_string(), + endpoint_id: "e1".into(), + branch_id: "b1".into(), }); // the counter should be observed despite 0 egress diff --git a/pyproject.toml b/pyproject.toml index 60cf0e5c96..18c8ece4a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,18 +6,18 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^7.3.1" +pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.31.0" pytest-xdist = "^3.3.1" -asyncpg = "^0.27.0" -aiopg = "^1.3.1" +asyncpg = "^0.29.0" +aiopg = "^1.4.0" Jinja2 = "^3.0.2" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" -boto3 = "^1.26.16" +boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^4.1.2"} backoff = "^2.2.1" @@ -34,7 +34,7 @@ types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.9.0" -pytest-rerunfailures = "^11.1.2" +pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index e26f2c6d6b..4d136472e0 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true +aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -31,6 +32,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls" aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] } 
pageserver = { path = "../pageserver" } +pageserver_api = { path = "../libs/pageserver_api" } remote_storage = { path = "../libs/remote_storage" } tracing.workspace = true @@ -38,3 +40,5 @@ tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" histogram = "0.7" + +futures.workspace = true diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 510a128663..7b9f96dce3 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,19 +1,21 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; +use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget}; +use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::{pin_mut, StreamExt}; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; -use utils::id::TenantTimelineId; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -39,9 +41,9 @@ impl TimelineAnalysis { } } -pub(crate) async fn branch_cleanup_and_check_errors( - id: &TenantTimelineId, - s3_root: &RootTarget, +pub(crate) fn branch_cleanup_and_check_errors( + id: &TenantShardTimelineId, + tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, s3_data: Option, @@ -73,8 +75,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation, - mut s3_layers, + index_part_generation: _index_part_generation, + 
s3_layers: _s3_layers, } => { if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { result.errors.push(format!( @@ -112,64 +114,21 @@ pub(crate) async fn branch_cleanup_and_check_errors( )) } - let layer_map_key = (layer, metadata.generation); - if !s3_layers.remove(&layer_map_key) { + if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { // FIXME: this will emit false positives if an index was // uploaded concurrently with our scan. To make this check // correct, we need to try sending a HEAD request for the // layer we think is missing. result.errors.push(format!( - "index_part.json contains a layer {}{} that is not present in remote storage", - layer_map_key.0.file_name(), - layer_map_key.1.get_suffix() + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", + layer.file_name(), + metadata.generation.get_suffix(), + metadata.shard )) } } - - let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers - .into_iter() - .filter(|(_layer_name, gen)| - // A layer is only considered orphaned if it has a generation below - // the index. If the generation is >= the index, then the layer may - // be an upload from a running pageserver, or even an upload from - // a new generation that didn't upload an index yet. - // - // Even so, a layer that is not referenced by the index could just - // be something enqueued for deletion, so while this check is valid - // for indicating that a layer is garbage, it is not an indicator - // of a problem. 
- gen < &index_part_generation) - .collect(); - - if !orphan_layers.is_empty() { - result.errors.push(format!( - "index_part.json does not contain layers from S3: {:?}", - orphan_layers - .iter() - .map(|(layer_name, gen)| format!( - "{}{}", - layer_name.file_name(), - gen.get_suffix() - )) - .collect::>(), - )); - result.garbage_keys.extend(orphan_layers.iter().map( - |(layer_name, layer_gen)| { - let mut key = s3_root.timeline_root(id).prefix_in_bucket; - let delimiter = s3_root.delimiter(); - if !key.ends_with(delimiter) { - key.push_str(delimiter); - } - key.push_str(&format!( - "{}{}", - &layer_name.file_name(), - layer_gen.get_suffix() - )); - key - }, - )); - } } + BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( parse_errors .into_iter() @@ -202,6 +161,83 @@ pub(crate) async fn branch_cleanup_and_check_errors( result } +#[derive(Default)] +pub(crate) struct LayerRef { + ref_count: usize, +} + +/// Top-level index of objects in a tenant. This may be used by any shard-timeline within +/// the tenant to query whether an object exists. +#[derive(Default)] +pub(crate) struct TenantObjectListing { + shard_timelines: + HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, +} + +impl TenantObjectListing { + /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall + /// list of layer keys for the Tenant. 
+ pub(crate) fn push( + &mut self, + ttid: TenantShardTimelineId, + layers: HashSet<(LayerFileName, Generation)>, + ) { + let shard_index = ShardIndex::new( + ttid.tenant_shard_id.shard_number, + ttid.tenant_shard_id.shard_count, + ); + let replaced = self.shard_timelines.insert( + (shard_index, ttid.timeline_id), + layers + .into_iter() + .map(|l| (l, LayerRef::default())) + .collect(), + ); + + assert!( + replaced.is_none(), + "Built from an S3 object listing, which should never repeat a key" + ); + } + + /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, + /// the layer's refcount will be incremented. Later, after calling this for all references in all indices + /// in a tenant, orphan layers may be detected by their zero refcounts. + /// + /// Returns true if the layer exists + pub(crate) fn check_ref( + &mut self, + timeline_id: TimelineId, + layer_file: &LayerFileName, + metadata: &IndexLayerMetadata, + ) -> bool { + let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { + return false; + }; + + let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else { + return false; + }; + + layer_ref.ref_count += 1; + + true + } + + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + let mut result = Vec::new(); + for ((shard_index, timeline_id), layers) in &self.shard_timelines { + for ((layer_file, generation), layer_ref) in layers { + if layer_ref.ref_count == 0 { + result.push((*shard_index, *timeline_id, layer_file.clone(), *generation)) + } + } + } + + result + } +} + #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, @@ -215,6 +251,8 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerFileName, Generation)>, }, + /// The remains of a deleted Timeline (i.e. 
an initdb archive only) + Relic, Incorrect(Vec), } @@ -233,7 +271,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St pub(crate) async fn list_timeline_blobs( s3_client: &Client, - id: TenantTimelineId, + id: TenantShardTimelineId, s3_root: &RootTarget, ) -> anyhow::Result { let mut s3_layers = HashSet::new(); @@ -245,6 +283,7 @@ pub(crate) async fn list_timeline_blobs( timeline_dir_target.delimiter = String::new(); let mut index_parts: Vec = Vec::new(); + let mut initdb_archive: bool = false; let stream = stream_listing(s3_client, &timeline_dir_target); pin_mut!(stream); @@ -258,6 +297,10 @@ pub(crate) async fn list_timeline_blobs( tracing::info!("Index key {key}"); index_parts.push(obj) } + Some("initdb.tar.zst") => { + tracing::info!("initdb archive {key}"); + initdb_archive = true; + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); @@ -279,6 +322,16 @@ pub(crate) async fn list_timeline_blobs( } } + if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::info!( + "Timeline is empty apart from initdb archive: expected post-deletion state." 
+ ); + return Ok(S3TimelineBlobData { + blob_data: BlobDataParseResult::Relic, + keys_to_remove: Vec::new(), + }); + } + // Choose the index_part with the highest generation let (index_part_object, index_part_generation) = match index_parts .iter() diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index f27e1d7f65..7192afb91b 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -10,15 +10,16 @@ use aws_sdk_s3::{ Client, }; use futures_util::{pin_mut, TryStreamExt}; +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -29,8 +30,8 @@ enum GarbageReason { #[derive(Serialize, Deserialize, Debug)] enum GarbageEntity { - Tenant(TenantId), - Timeline(TenantTimelineId), + Tenant(TenantShardId), + Timeline(TenantShardTimelineId), } #[derive(Serialize, Deserialize, Debug)] @@ -142,6 +143,9 @@ async fn find_garbage_inner( console_projects.len() ); + // TODO(sharding): batch calls into Console so that we only call once for each TenantId, + // rather than checking the same TenantId for multiple TenantShardId + // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); let tenants = stream_tenants(&s3_client, &target); @@ -149,10 +153,10 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); let console_projects = &console_projects; async move { - match console_projects.get(&t) { + match console_projects.get(&t.tenant_id) { 
Some(project_data) => Ok((t, Some(project_data.clone()))), None => api_client - .find_tenant_project(t) + .find_tenant_project(t.tenant_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (t, r)), @@ -166,21 +170,21 @@ async fn find_garbage_inner( // checks if they are enabled by the `depth` parameter. pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); - let mut active_tenants: Vec = vec![]; + let mut active_tenants: Vec = vec![]; let mut counter = 0; while let Some(result) = tenants_checked.next().await { - let (tenant_id, console_result) = result?; + let (tenant_shard_id, console_result) = result?; // Paranoia check if let Some(project) = &console_result { - assert!(project.tenant == tenant_id); + assert!(project.tenant == tenant_shard_id.tenant_id); } - if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) { - tracing::debug!("Tenant {tenant_id} is garbage"); + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { + tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { - tracing::debug!("Tenant {tenant_id} is active"); - active_tenants.push(tenant_id); + tracing::debug!("Tenant {tenant_shard_id} is active"); + active_tenants.push(tenant_shard_id); } counter += 1; @@ -266,13 +270,13 @@ impl std::fmt::Display for PurgeMode { pub async fn get_tenant_objects( s3_client: &Arc, target: RootTarget, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> anyhow::Result> { - tracing::debug!("Listing objects in tenant {tenant_id}"); + tracing::debug!("Listing objects in tenant {tenant_shard_id}"); // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_id); + let mut tenant_root = target.tenant_root(&tenant_shard_id); // Remove delimiter, so that object listing lists all keys in the prefix and not just // common prefixes. 
@@ -285,7 +289,7 @@ pub async fn get_tenant_objects( pub async fn get_timeline_objects( s3_client: &Arc, target: RootTarget, - ttid: TenantTimelineId, + ttid: TenantShardTimelineId, ) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); let mut timeline_root = target.timeline_root(&ttid); diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e5465952fb..d2842877d0 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -15,13 +15,17 @@ use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::retry::RetryConfig; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; -use aws_sdk_s3::config::Region; +use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; +use aws_smithy_async::rt::sleep::TokioSleep; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::io::IsTerminal; @@ -29,7 +33,7 @@ use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TimelineId; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -44,6 +48,35 @@ pub struct S3Target { pub delimiter: String, } +/// Convenience for referring to timelines within a particular shard: more ergonomic +/// than using a 2-tuple. +/// +/// This is the shard-aware equivalent of TenantTimelineId. 
It's defined here rather +/// than somewhere more broadly exposed, because this kind of thing is rarely needed +/// in the pageserver, as all timeline objects exist in the scope of a particular +/// tenant: the scrubber is different in that it handles collections of data referring to many +/// TenantShardTimelineIds in one place. +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct TenantShardTimelineId { + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +} + +impl TenantShardTimelineId { + fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self { + Self { + tenant_shard_id, + timeline_id, + } + } +} + +impl Display for TenantShardTimelineId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id) + } +} + #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] pub enum TraversingDepth { Tenant, @@ -86,7 +119,9 @@ impl S3Target { if new_self.prefix_in_bucket.is_empty() { new_self.prefix_in_bucket = format!("/{}/", new_segment); } else { - let _ = new_self.prefix_in_bucket.pop(); + if new_self.prefix_in_bucket.ends_with('/') { + new_self.prefix_in_bucket.pop(); + } new_self.prefix_in_bucket = [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); } @@ -108,19 +143,19 @@ impl RootTarget { } } - pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } - pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), Self::Safekeeper(_) => self.tenant_root(tenant_id), } } - pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target { - self.timelines_root(&id.tenant_id) + pub fn timeline_root(&self, id:
&TenantShardTimelineId) -> S3Target { + self.timelines_root(&id.tenant_shard_id) .with_sub_segment(&id.timeline_id.to_string()) } @@ -223,6 +258,11 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let chain = CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder().build(), ); // Use SSO if we were given an account ID @@ -233,7 +273,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") - .region(Region::from_static("eu-central-1")) + .region(bucket_region.clone()) .build(), ), None => chain, @@ -245,9 +285,13 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let mut builder = Config::builder() .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) + .retry_config(RetryConfig::adaptive().with_max_attempts(3)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) .credentials_provider(credentials_provider); if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 1f0ceebdaf..957213856b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,3 +1,4 @@ +use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; @@ -34,6 +35,8 @@ enum Command { ScanMetadata { #[arg(short, long, default_value_t = false)] json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, }, } @@ -57,24 +60,37 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } 
=> match scan_metadata(bucket_config).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + Command::ScanMetadata { json, tenant_ids } => { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } - }, + } Command::FindGarbage { node_kind, depth, diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 4cfa77cfc1..073f37f319 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -3,14 +3,15 @@ use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId}; -use utils::id::{TenantTimelineId, TimelineId}; +use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use pageserver_api::shard::TenantShardId; +use utils::id::TimelineId; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn 
stream_tenants<'a>( s3_client: &'a Client, target: &'a RootTarget, -) -> impl Stream> + 'a { +) -> impl Stream> + 'a { try_stream! { let mut continuation_token = None; let tenants_target = target.tenants_root(); @@ -44,14 +45,14 @@ pub fn stream_tenants<'a>( } } -/// Given a TenantId, output a stream of the timelines within that tenant, discovered +/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. pub async fn stream_tenant_timelines<'a>( s3_client: &'a Client, target: &'a RootTarget, - tenant: TenantId, -) -> anyhow::Result> + 'a> { + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { let mut timeline_ids: Vec> = Vec::new(); let mut continuation_token = None; let timelines_target = target.timelines_root(&tenant); @@ -98,7 +99,7 @@ pub async fn stream_tenant_timelines<'a>( Ok(stream! 
{ for i in timeline_ids { let id = i?; - yield Ok(TenantTimelineId::new(tenant, id)); + yield Ok(TenantShardTimelineId::new(tenant, id)); } }) } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index ad82db1e76..bfde8f0213 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -2,23 +2,25 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TimelineAnalysis, + TenantObjectListing, TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; +use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; use serde::Serialize; -use utils::id::TenantTimelineId; +use utils::id::TenantId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, - with_errors: HashSet, - with_warnings: HashSet, - with_garbage: HashSet, + with_errors: HashSet, + with_warnings: HashSet, + with_orphans: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -88,7 +90,7 @@ impl MetadataSummary { count: 0, with_errors: HashSet::new(), with_warnings: HashSet::new(), - with_garbage: HashSet::new(), + with_orphans: HashSet::new(), indices_by_version: HashMap::new(), layer_count: MinMaxHisto::new(), timeline_size_bytes: MinMaxHisto::new(), @@ -132,7 +134,7 @@ impl MetadataSummary { } } - fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) { + fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -142,6 +144,10 @@ 
impl MetadataSummary { } } + fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) { + self.with_orphans.insert(*ttid); + } + /// Long-form output for printing at end of a scan pub fn summary_string(&self) -> String { let version_summary: String = itertools::join( @@ -155,7 +161,7 @@ impl MetadataSummary { "Timelines: {0} With errors: {1} With warnings: {2} -With garbage: {3} +With orphan layers: {3} Index versions: {version_summary} Timeline size bytes: {4} Layer size bytes: {5} @@ -164,7 +170,7 @@ Timeline layer count: {6} self.count, self.with_errors.len(), self.with_warnings.len(), - self.with_garbage.len(), + self.with_orphans.len(), self.timeline_size_bytes.oneline(), self.layer_size_bytes.oneline(), self.layer_count.oneline(), @@ -174,13 +180,24 @@ Timeline layer count: {6} pub fn is_fatal(&self) -> bool { !self.with_errors.is_empty() } + + pub fn is_empty(&self) -> bool { + self.count == 0 + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { +pub async fn scan_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, +) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; - let tenants = stream_tenants(&s3_client, &target); + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
@@ -188,31 +205,131 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> { + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { let data = list_timeline_blobs(s3_client, ttid, target).await?; Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffer_unordered(CONCURRENCY); + let timelines = timelines.try_buffered(CONCURRENCY); + // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different + // shards in the same tenant might refer to one another's keys if a shard split has happened. + + let mut tenant_id = None; + let mut tenant_objects = TenantObjectListing::default(); + let mut tenant_timeline_results = Vec::new(); + + fn analyze_tenant( + tenant_id: TenantId, + summary: &mut MetadataSummary, + mut tenant_objects: TenantObjectListing, + timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + ) { + let mut timeline_generations = HashMap::new(); + for (ttid, data) in timelines { + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = + branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); + summary.update_analysis(&ttid, &analysis); + } + + // Identifying orphan layers must be done on a tenant-wide basis, because individual + // shards' layers may be referenced by other shards. + // + // Orphan layers are not a corruption, and not an indication of a problem.
They are just + // consuming some space in remote storage, and may be cleaned up at leisure. + for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() { + let ttid = TenantShardTimelineId { + tenant_shard_id: TenantShardId { + tenant_id, + shard_count: shard_index.shard_count, + shard_number: shard_index.shard_number, + }, + timeline_id, + }; + + if let Some(timeline_generation) = timeline_generations.get(&ttid) { + if &generation >= timeline_generation { + // Candidate orphan layer is in the current or future generation relative + // to the index we read for this timeline shard, so its absence from the index + // doesn't make it an orphan: more likely, it is a case where the layer was + // uploaded, but the index referencing the layer wasn't written yet. + continue; + } + } + + let orphan_path = remote_layer_path( + &tenant_id, + &timeline_id, + shard_index, + &layer_file, + generation, + ); + + tracing::info!("Orphan layer detected: {orphan_path}"); + + summary.notify_timeline_orphan(&ttid); + } + } + + // Iterate through all the timeline results. These are in key-order, so + // all results for the same tenant will be adjacent. We accumulate these, + // and then call `analyze_tenant` to flush, when we see the next tenant ID. 
let mut summary = MetadataSummary::new(); pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); - let analysis = - branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await; + match tenant_id { + None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + Some(prev_tenant_id) => { + if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + let tenant_objects = std::mem::take(&mut tenant_objects); + let timelines = std::mem::take(&mut tenant_timeline_results); + analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + } + } + } - summary.update_analysis(&ttid, &analysis); + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } = &data.blob_data + { + tenant_objects.push(ttid, s3_layers.clone()); + } + tenant_timeline_results.push((ttid, data)); + } + + if !tenant_timeline_results.is_empty() { + analyze_tenant( + tenant_id.expect("Must be set if results are present"), + &mut summary, + tenant_objects, + tenant_timeline_results, + ); } Ok(summary) diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 53fcd5ff07..4015c27933 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, including failpoints.
In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] async-stream.workspace = true anyhow.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true +fail.workspace = true fs2.workspace = true git-version.workspace = true hex.workspace = true @@ -35,6 +42,7 @@ serde_with.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } +tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true toml_edit.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e59deb9fda..33047051df 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +const FEATURES: &[&str] = &[ + #[cfg(feature = "testing")] + "testing", +]; + +fn version() -> String { + format!( + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, + ) +} + const ABOUT: &str = r#" A fleet of safekeepers is responsible for reliably storing WAL received from compute, passing it through consensus (mitigating potential computes brain @@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> { // getting 'argument cannot be used multiple times' error. This seems to be // impossible with pure Derive API, so convert struct to Command, modify it, // parse arguments, and then fill the struct back. 
- let cmd = ::command().args_override_self(true); + let cmd = ::command() + .args_override_self(true) + .version(version()); let mut matches = cmd.get_matches(); let mut args = ::from_arg_matches_mut(&mut matches)?; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index d5333abae6..761541168c 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -11,7 +11,7 @@ use tracing::{debug, info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; @@ -210,10 +210,7 @@ impl postgres_backend::Handler let cmd = parse_cmd(query_string)?; let cmd_str = cmd_to_string(&cmd); - PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); - scopeguard::defer! { - PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); - } + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); info!("got query {:?}", query_string); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index c48b5330b3..25a3334e63 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use utils::failpoint_support::failpoints_handler; use std::io::Write as _; use tokio::sync::mpsc; @@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/v1/status", |r| request_span(r, status_handler)) + .put("/v1/failpoints", |r| { + request_span(r, move |r| async { + let cancel = CancellationToken::new(); + failpoints_handler(r, cancel).await + 
}) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 0711beb290..11a3f48922 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,7 +11,8 @@ use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, + IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -89,16 +90,10 @@ pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_broker_pulled_updates_total counter") }); -pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { - register_int_counter_vec!( +pub static PG_QUERIES_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "safekeeper_pg_queries_received_total", "Number of queries received through pg protocol", - &["query"] - ) - .expect("Failed to register safekeeper_pg_queries_received_total counter") -}); -pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { - register_int_counter_vec!( "safekeeper_pg_queries_finished_total", "Number of queries finished through pg protocol", &["query"] diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 47a624281d..217a5f89ee 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -914,9 +914,14 @@ where Ok(()) } - /// Persist control file to disk, called only after timeline creation (bootstrap). - pub async fn persist(&mut self) -> Result<()> { - self.persist_control_file(self.state.clone()).await + /// Persist in-memory state of control file to disk. 
+ // + // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better + // separate state completely and give Arc to all those who need it. + pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> { + let mut state = self.state.clone(); + state.remote_consistent_lsn = inmem_remote_consistent_lsn; + self.persist_control_file(state).await } /// Persist in-memory state to the disk, taking other data from state. @@ -930,7 +935,7 @@ where /// Persist control file if there is something to save and enough time /// passed after the last save. - pub async fn maybe_persist_control_file( + pub async fn maybe_persist_inmem_control_file( &mut self, inmem_remote_consistent_lsn: Lsn, ) -> Result<()> { @@ -943,9 +948,7 @@ where || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn; if need_persist { - let mut state = self.state.clone(); - state.remote_consistent_lsn = inmem_remote_consistent_lsn; - self.persist_control_file(state).await?; + self.persist_inmem(inmem_remote_consistent_lsn).await?; trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); } Ok(()) @@ -1064,8 +1067,6 @@ where if sync_control_file { let mut state = self.state.clone(); - // Note: we could make remote_consistent_lsn update in cf common by - // storing Arc to walsenders in Safekeeper. 
state.remote_consistent_lsn = new_remote_consistent_lsn; self.persist_control_file(state).await?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 44f14f8c7e..9a5657a40d 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::lsn::AtomicLsn; use utils::pageserver_feedback::PageserverFeedback; @@ -391,15 +392,8 @@ impl SafekeeperPostgresHandler { // application_name: give only committed WAL (used by pageserver) or all // existing WAL (up to flush_lsn, used by walproposer or peer recovery). // The second case is always driven by a consensus leader which term - // must generally be also supplied. However we're sloppy to do this in - // walproposer recovery which will be removed soon. So TODO is to make - // it not Option'al then. - // - // Fetching WAL without term in recovery creates a small risk of this - // WAL getting concurrently garbaged if another compute rises which - // collects majority and starts fixing log on this safekeeper itself. - // That's ok as (old) proposer will never be able to commit such WAL. - let end_watch = if self.is_walproposer_recovery() { + // must be supplied. + let end_watch = if term.is_some() { EndWatch::Flush(tli.get_term_flush_lsn_watch_rx()) } else { EndWatch::Commit(tli.get_commit_lsn_watch_rx()) @@ -535,12 +529,19 @@ impl WalSender<'_, IO> { ); // try to send as much as available, capped by MAX_SEND_SIZE - let mut send_size = self - .end_pos - .checked_sub(self.start_pos) - .context("reading wal without waiting for it first")? 
- .0 as usize; - send_size = min(send_size, self.send_buf.len()); + let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= self.end_pos { + chunk_end_pos = self.end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize; let send_buf = &mut self.send_buf[..send_size]; let send_size: usize; { @@ -551,7 +552,8 @@ impl WalSender<'_, IO> { } else { None }; - // read wal into buffer + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. send_size = self.wal_reader.read(send_buf).await? }; let send_buf = &send_buf[..send_size]; @@ -566,6 +568,11 @@ impl WalSender<'_, IO> { })) .await?; + if let Some(appname) = &self.appname { + if appname == "replica" { + failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep"); + } + } trace!( "sent {} bytes of WAL {}-{}", send_size, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2ba871207e..bdc9088138 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -182,8 +182,9 @@ impl SharedState { } /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. - fn update_status( + /// start/stop action. If timeline is deactivated, control file is persisted + /// as maintenance task does that only for active timelines. 
+ async fn update_status( &mut self, num_computes: usize, remote_consistent_lsn: Lsn, @@ -191,7 +192,15 @@ impl SharedState { ) -> bool { let is_active = self.is_active(num_computes, remote_consistent_lsn); if self.active != is_active { - info!("timeline {} active={} now", ttid, is_active); + info!( + "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", + ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn + ); + if !is_active { + if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await { + warn!("control file save in update_status failed: {:?}", e); + } + } } self.active = is_active; self.is_wal_backup_action_pending(num_computes) @@ -438,7 +447,7 @@ impl Timeline { fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.persist().await { + if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await { // Bootstrap failed, cancel timeline and remove timeline directory. self.cancel(shared_state); @@ -511,12 +520,14 @@ impl Timeline { self.mutex.lock().await } - fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state.update_status( - self.walreceivers.get_num(), - self.get_walsenders().get_remote_consistent_lsn(), - self.ttid, - ) + async fn update_status(&self, shared_state: &mut SharedState) -> bool { + shared_state + .update_status( + self.walreceivers.get_num(), + self.get_walsenders().get_remote_consistent_lsn(), + self.ttid, + ) + .await } /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. @@ -526,7 +537,7 @@ impl Timeline { } let is_wal_backup_action_pending: bool = { let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state) + self.update_status(&mut shared_state).await }; if is_wal_backup_action_pending { // Can fail only if channel to a static thread got closed, which is not normal at all. 
@@ -683,7 +694,7 @@ impl Timeline { shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state); + is_wal_backup_action_pending = self.update_status(&mut shared_state).await; commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; @@ -828,7 +839,7 @@ impl Timeline { self.write_shared_state() .await .sk - .maybe_persist_control_file(remote_consistent_lsn) + .maybe_persist_inmem_control_file(remote_consistent_lsn) .await } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 22c68ce3c9..c99bbc7d61 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -35,6 +35,9 @@ use once_cell::sync::OnceCell; const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; +/// Default buffer size when interfacing with [`tokio::fs::File`]. +const BUFFER_SIZE: usize = 32 * 1024; + /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. 
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { @@ -494,15 +497,13 @@ async fn backup_object( .as_ref() .unwrap(); - let file = tokio::io::BufReader::new( - File::open(&source_file) - .await - .with_context(|| format!("Failed to open file {} for wal backup", source_file))?, - ); - - storage - .upload_storage_object(Box::new(file), size, target_file) + let file = File::open(&source_file) .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE); + + storage.upload_storage_object(file, size, target_file).await } pub async fn read_object( @@ -524,5 +525,9 @@ pub async fn read_object( format!("Failed to open WAL segment download stream for remote path {file_path:?}") })?; - Ok(download.download_stream) + let reader = tokio_util::io::StreamReader::new(download.download_stream); + + let reader = tokio::io::BufReader::with_capacity(BUFFER_SIZE, reader); + + Ok(Box::pin(reader)) } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index fa44b24258..e7538f805c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -565,6 +565,9 @@ impl WalReader { }) } + /// Read WAL at current position into provided buf, returns number of bytes + /// read. It can be smaller than buf size only if segment boundary is + /// reached. 
pub async fn read(&mut self, buf: &mut [u8]) -> Result { // If this timeline is new, we may not have a full segment yet, so // we pad the first bytes of the timeline's first WAL segment with 0s diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 77e4310eac..ff584bd4b0 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -266,9 +266,7 @@ class NeonPageserverHttpClient(requests.Session): def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): res = self.post( f"http://{self.host}:{self.port}/v1/tenant", - json={ - "new_tenant_id": new_tenant_id.hex, - }, + json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, ) if res.status_code == 409: diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore index d9d4d0296a..cdf99aefd7 100644 --- a/scripts/sk_collect_dumps/.gitignore +++ b/scripts/sk_collect_dumps/.gitignore @@ -1,2 +1,4 @@ result *.json +hosts +poetry.lock diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg new file mode 100644 index 0000000000..150986ab79 --- /dev/null +++ b/scripts/sk_collect_dumps/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +host_key_checking = False +inventory=./hosts +remote_tmp=/tmp +remote_user=developer +callbacks_enabled = profile_tasks + +[ssh_connection] +scp_if_ssh = True +ssh_args = -F ./ssh.cfg +pipelining = True diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml new file mode 100644 index 0000000000..c6f6adafe2 --- /dev/null +++ b/scripts/sk_collect_dumps/pyproject.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "sk-collect-dumps" +version = "0.1.0" +description = "" +authors = ["Arseny Sher "] +readme = "README.md" +packages = [{include = "sk_collect_dumps"}] + +[tool.poetry.dependencies] +python = "^3.11" +ansible = "^9.1.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = 
"poetry.core.masonry.api" diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 52b73e9495..7494a6cb78 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -1,25 +1,43 @@ # Collect /v1/debug_dump from all safekeeper nodes -1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. - -## How to use ansible (staging) - +3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +# staging: +AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# prod: +AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# check +echo $AUTH_TOKEN +``` +2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +There are two ways to do that, with ssm or tsh. ssm: +``` +# in aws repo, cd .github/ansible and run e.g. 
(adjusting profile and region in vars and limit): +AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml +``` +It will put the results to .results directory *near the playbook*. + +tsh: + +Update the inventory, if needed, selecting .build/.tech and optionally region: +``` +rm -f hosts && echo '[safekeeper]' >> hosts +# staging: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts +# prod: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts ``` -## How to use ansible (prod) - +Test ansible connection: ``` -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +ansible all -m ping -v ``` +Download the dumps: +``` +mkdir -p result && rm -f result/* +ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml +``` + +3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. 
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml index 29ce83efde..f214d0ae2c 100644 --- a/scripts/sk_collect_dumps/remote.yaml +++ b/scripts/sk_collect_dumps/remote.yaml @@ -1,18 +1,37 @@ - name: Fetch state dumps from safekeepers - hosts: safekeepers + hosts: safekeeper gather_facts: False - remote_user: "{{ remote_user }}" tasks: - - name: Download file + - name: Dump file get_url: url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" - dest: "/tmp/{{ inventory_hostname }}.json" + dest: "/tmp/{{ inventory_hostname }}-dump.json" + headers: + Authorization: "Bearer {{ auth_token }}" - - name: Fetch file from remote hosts + - name: install rsync + ansible.builtin.apt: + name: rsync + update_cache: yes + become: yes + ignore_errors: true # it can be already installed and we don't always have sudo + + - name: Fetch file from remote hosts (works only with ssm) fetch: - src: "/tmp/{{ inventory_hostname }}.json" - dest: "./result/{{ inventory_hostname }}.json" + src: "/tmp/{{ inventory_hostname }}-dump.json" + dest: "./result/{{ inventory_hostname }}-dump.json" flat: yes fail_on_missing: no + when: ansible_connection == "aws_ssm" + # xxx not sure how to make ansible 'synchronize' work with tsh + - name: Fetch file from remote hosts + shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json" + delegate_to: localhost + when: ansible_connection != "aws_ssm" + + - name: remove remote dumps + ansible.builtin.file: + path: "/tmp/{{ inventory_hostname }}-dump.json" + state: absent diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg new file mode 100644 index 0000000000..827c5d9286 --- /dev/null +++ b/scripts/sk_collect_dumps/ssh.cfg @@ -0,0 +1,13 @@ +# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh + +# Common flags for all teleport.aws.neon.tech 
hosts +Host * + HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com + +# Flags for all teleport.aws.neon.tech hosts except the proxy +Host * !teleport.aws.neon.tech + Port 3022 + ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p + User developer + +# End generated Teleport configuration \ No newline at end of file diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh index 2e54ecba1c..5189883fcb 100755 --- a/scripts/sk_collect_dumps/upload.sh +++ b/scripts/sk_collect_dumps/upload.sh @@ -31,22 +31,22 @@ SELECT (data->>'tenant_id') AS tenant_id, (data->>'timeline_id') AS timeline_id, (data->'memory'->>'active')::bool AS active, - (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, - (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, - (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, - (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, - (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, - (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn, + (data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn, (data->'memory'->>'num_computes')::bigint AS num_computes, - (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn, (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, (data->'memory'->>'is_cancelled')::bool AS is_cancelled, - 
(data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, - (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn, (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, - (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, - (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, - (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, - (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn + (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn FROM tmp_json EOF diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index a0c8e1f749..d66cbefa45 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -3,9 +3,12 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use clap::Parser; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; + +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; @@ -91,15 +94,23 @@ async fn subscribe(client: Option, counter: Arc, None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; - let key = 
SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + let ttid = ProtoTenantTimelineId { tenant_id: vec![0xFF; 16], timeline_id: tli_from_u64(i), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), }; - let mut stream = client - .subscribe_safekeeper_info(request) + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo.into(), + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ttid), + }), + }; + + let mut stream: tonic::Streaming = client + .subscribe_by_filter(request) .await .unwrap() .into_inner(); diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index aa9d62a29f..7d1b63d23f 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -10,6 +10,12 @@ service BrokerService { // Publish safekeeper updates. rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {}; + + // Subscribe to all messages, limited by a filter. + rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {}; + + // Publish one message. + rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {}; } message SubscribeSafekeeperInfoRequest { @@ -48,3 +54,55 @@ message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; } + +message FilterTenantTimelineId { + // If true, only messages related to `tenant_timeline_id` will be emitted. + // Otherwise, messages for all timelines will be emitted. + bool enabled = 1; + TenantTimelineId tenant_timeline_id = 2; +} + +message TypeSubscription { + MessageType type = 1; +} + +message SubscribeByFilterRequest { + // Subscription will emit messages only of the specified types. You need to specify + // at least one type to receive any messages. 
+ repeated TypeSubscription types = 1; + + // If set and enabled, subscription will emit messages only for the specified tenant/timeline. + optional FilterTenantTimelineId tenant_timeline_id = 2; +} + +enum MessageType { + UNKNOWN = 0; + SAFEKEEPER_TIMELINE_INFO = 2; + SAFEKEEPER_DISCOVERY_REQUEST = 3; + SAFEKEEPER_DISCOVERY_RESPONSE = 4; +} + +// A message with a type. +message TypedMessage { + MessageType type = 1; + + optional SafekeeperTimelineInfo safekeeper_timeline_info = 2; + optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3; + optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4; +} + +message SafekeeperDiscoveryRequest { + TenantTimelineId tenant_timeline_id = 1; +} + +// Shorter version of SafekeeperTimelineInfo, contains only necessary fields. +message SafekeeperDiscoveryResponse { + uint64 safekeeper_id = 1; + TenantTimelineId tenant_timeline_id = 2; + // WAL available to download. + uint64 commit_lsn = 3; + // A connection string to use for WAL downloading. + string safekeeper_connstr = 4; + // Availability zone of a safekeeper. 
+ optional string availability_zone = 5; +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9f81ac6cac..4e5f8ed724 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -35,10 +35,16 @@ use tracing::*; use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; -use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; +use storage_broker::metrics::{ + BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, +}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, +}; use storage_broker::{ parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, }; @@ -73,8 +79,103 @@ struct Args { log_format: String, } -type PubId = u64; // id of publisher for registering in maps -type SubId = u64; // id of subscriber for registering in maps +/// Id of publisher for registering in maps +type PubId = u64; + +/// Id of subscriber for registering in maps +type SubId = u64; + +/// Single enum type for all messages. +#[derive(Clone, Debug, PartialEq)] +#[allow(clippy::enum_variant_names)] +enum Message { + SafekeeperTimelineInfo(SafekeeperTimelineInfo), + SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest), + SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse), +} + +impl Message { + /// Convert proto message to internal message. 
+ pub fn from(proto_msg: TypedMessage) -> Result { + match proto_msg.r#type() { + MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( + proto_msg.safekeeper_timeline_info.ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info") + })?, + )), + MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest( + proto_msg.safekeeper_discovery_request.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_request", + ) + })?, + )), + MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse( + proto_msg.safekeeper_discovery_response.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_response", + ) + })?, + )), + MessageType::Unknown => Err(Status::new( + Code::InvalidArgument, + format!("invalid message type: {:?}", proto_msg.r#type), + )), + } + } + + /// Get the tenant_timeline_id from the message. + pub fn tenant_timeline_id(&self) -> Result, Status> { + match self { + Message::SafekeeperTimelineInfo(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryRequest(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryResponse(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + } + } + + /// Convert internal message to the protobuf struct. 
+ pub fn as_typed_message(&self) -> TypedMessage { + let mut res = TypedMessage { + r#type: self.message_type() as i32, + ..Default::default() + }; + match self { + Message::SafekeeperTimelineInfo(msg) => { + res.safekeeper_timeline_info = Some(msg.clone()) + } + Message::SafekeeperDiscoveryRequest(msg) => { + res.safekeeper_discovery_request = Some(msg.clone()) + } + Message::SafekeeperDiscoveryResponse(msg) => { + res.safekeeper_discovery_response = Some(msg.clone()) + } + } + res + } + + /// Get the message type. + pub fn message_type(&self) -> MessageType { + match self { + Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo, + Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest, + Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse, + } + } +} #[derive(Copy, Clone, Debug)] enum SubscriptionKey { @@ -83,7 +184,7 @@ enum SubscriptionKey { } impl SubscriptionKey { - // Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). + /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -92,14 +193,29 @@ impl SubscriptionKey { } } } + + /// Parse from FilterTenantTimelineId + pub fn from_proto_filter_tenant_timeline_id( + f: &FilterTenantTimelineId, + ) -> Result { + if !f.enabled { + return Ok(SubscriptionKey::All); + } + + let ttid = + parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing tenant_timeline_id") + })?)?; + Ok(SubscriptionKey::Timeline(ttid)) + } } -// Channel to timeline subscribers. +/// Channel to timeline subscribers. struct ChanToTimelineSub { - chan: broadcast::Sender, - // Tracked separately to know when delete the shmem entry. 
receiver_count() - // is unhandy for that as unregistering and dropping the receiver side - // happens at different moments. + chan: broadcast::Sender, + /// Tracked separately to know when delete the shmem entry. receiver_count() + /// is unhandy for that as unregistering and dropping the receiver side + /// happens at different moments. num_subscribers: u64, } @@ -110,7 +226,7 @@ struct SharedState { num_subs_to_timelines: i64, chans_to_timeline_subs: HashMap, num_subs_to_all: i64, - chan_to_all_subs: broadcast::Sender, + chan_to_all_subs: broadcast::Sender, } impl SharedState { @@ -146,7 +262,7 @@ impl SharedState { &mut self, sub_key: SubscriptionKey, timeline_chan_size: usize, - ) -> (SubId, broadcast::Receiver) { + ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; let sub_rx = match sub_key { @@ -262,6 +378,29 @@ impl Registry { subscriber.id, subscriber.key, subscriber.remote_addr ); } + + /// Send msg to relevant subscribers. + pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { + PROCESSED_MESSAGES_TOTAL.inc(); + + // send message to subscribers for everything + let shared_state = self.shared_state.read(); + // Err means there is no subscribers, it is fine. + shared_state.chan_to_all_subs.send(msg.clone()).ok(); + + // send message to per timeline subscribers, if there is ttid + let ttid = msg.tenant_timeline_id()?; + if let Some(ttid) = ttid { + if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { + // Err can't happen here, as tx is destroyed only after removing + // from the map the last subscriber along with tx. + subs.chan + .send(msg.clone()) + .expect("rx is still in the map with zero subscribers"); + } + } + Ok(()) + } } // Private subscriber state. @@ -269,7 +408,7 @@ struct Subscriber { id: SubId, key: SubscriptionKey, // Subscriber receives messages from publishers here. 
- sub_rx: broadcast::Receiver, + sub_rx: broadcast::Receiver, // to unregister itself from shared state in Drop registry: Registry, // for logging @@ -291,26 +430,9 @@ struct Publisher { } impl Publisher { - // Send msg to relevant subscribers. - pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> { - // send message to subscribers for everything - let shared_state = self.registry.shared_state.read(); - // Err means there is no subscribers, it is fine. - shared_state.chan_to_all_subs.send(msg.clone()).ok(); - - // send message to per timeline subscribers - let ttid = - parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| { - Status::new(Code::InvalidArgument, "missing tenant_timeline_id") - })?)?; - if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { - // Err can't happen here, as tx is destroyed only after removing - // from the map the last subscriber along with tx. - subs.chan - .send(msg.clone()) - .expect("rx is still in the map with zero subscribers"); - } - Ok(()) + /// Send msg to relevant subscribers. 
+ pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { + self.registry.send_msg(msg) } } @@ -339,7 +461,7 @@ impl BrokerService for Broker { loop { match stream.next().await { - Some(Ok(msg)) => publisher.send_msg(&msg)?, + Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?, Some(Err(e)) => return Err(e), // grpc error from the stream None => break, // closed stream } @@ -371,8 +493,15 @@ impl BrokerService for Broker { let mut missed_msgs: u64 = 0; loop { match subscriber.sub_rx.recv().await { - Ok(info) => yield info, + Ok(info) => { + match info { + Message::SafekeeperTimelineInfo(info) => yield info, + _ => {}, + } + BROADCASTED_MESSAGES_TOTAL.inc(); + }, Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); missed_msgs += skipped_msg; if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", @@ -392,6 +521,78 @@ impl BrokerService for Broker { Box::pin(output) as Self::SubscribeSafekeeperInfoStream )) } + + type SubscribeByFilterStream = + Pin> + Send + 'static>>; + + /// Subscribe to all messages, limited by a filter. + async fn subscribe_by_filter( + &self, + request: Request, + ) -> std::result::Result, Status> { + let remote_addr = request + .remote_addr() + .expect("TCPConnectInfo inserted by handler"); + let proto_filter = request.into_inner(); + let ttid_filter = proto_filter + .tenant_timeline_id + .as_ref() + .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + + let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; + let types_set = proto_filter + .types + .iter() + .map(|t| t.r#type) + .collect::>(); + + let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr); + + // transform rx into stream with item = Result, as method result demands + let output = async_stream::try_stream! 
{ + let mut warn_interval = time::interval(Duration::from_millis(1000)); + let mut missed_msgs: u64 = 0; + loop { + match subscriber.sub_rx.recv().await { + Ok(msg) => { + let msg_type = msg.message_type() as i32; + if types_set.contains(&msg_type) { + yield msg.as_typed_message(); + BROADCASTED_MESSAGES_TOTAL.inc(); + } + }, + Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); + missed_msgs += skipped_msg; + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + missed_msgs = 0; + } + } + Err(RecvError::Closed) => { + // can't happen, we never drop the channel while there is a subscriber + Err(Status::new(Code::Internal, "channel unexpectantly closed"))?; + } + } + } + }; + + Ok(Response::new( + Box::pin(output) as Self::SubscribeByFilterStream + )) + } + + /// Publish one message. + async fn publish_one( + &self, + request: Request, + ) -> std::result::Result, Status> { + let msg = Message::from(request.into_inner())?; + PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc(); + self.registry.send_msg(&msg)?; + Ok(Response::new(())) + } } // We serve only metrics and healthcheck through http1. 
@@ -515,8 +716,8 @@ mod tests { use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; - fn msg(timeline_id: Vec) -> SafekeeperTimelineInfo { - SafekeeperTimelineInfo { + fn msg(timeline_id: Vec) -> Message { + Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, tenant_timeline_id: Some(ProtoTenantTimelineId { tenant_id: vec![0x00; 16], @@ -533,7 +734,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, - } + }) } fn tli_from_u64(i: u64) -> Vec { diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index f0649d0f68..1fd3dd5ad6 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. -use metrics::{register_int_gauge, IntGauge}; +use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { @@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy = Lazy::new(|| { ) .expect("Failed to register metric") }); + +pub static PROCESSED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_processed_messages_total", + "Number of messages received by storage broker, before routing and broadcasting" + ) + .expect("Failed to register metric") +}); + +pub static BROADCASTED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcasted_messages_total", + "Number of messages broadcasted (sent over network) to subscribers" + ) + .expect("Failed to register metric") +}); + +pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcast_dropped_messages_total", + "Number of messages dropped due to channel capacity overflow" + ) + .expect("Failed to register metric") +}); + +pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + 
"storage_broker_published_oneoff_messages_total", + "Number of one-off messages sent via PublishOne method" + ) + .expect("Failed to register metric") +}); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 862aab84dc..5b1a8ba27d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -28,6 +28,7 @@ import jwt import psycopg2 import pytest import requests +import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest @@ -56,6 +57,7 @@ from fixtures.remote_storage import ( RemoteStorageKind, RemoteStorageUser, S3Storage, + default_remote_storage, remote_storage_to_toml_inline_table, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -345,7 +347,9 @@ class PgProtocol: """ return self.safe_psql_many([query], **kwargs)[0] - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + def safe_psql_many( + self, queries: List[str], log_query=True, **kwargs: Any + ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. This method passes all extra params to connstr. @@ -354,7 +358,8 @@ class PgProtocol: with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: for query in queries: - log.info(f"Executing query: {query}") + if log_query: + log.info(f"Executing query: {query}") cur.execute(query) if cur.description is None: @@ -363,6 +368,12 @@ class PgProtocol: result.append(cur.fetchall()) return result + def safe_psql_scalar(self, query, log_query=True) -> Any: + """ + Execute query returning single row with single column. 
+ """ + return self.safe_psql(query, log_query=log_query)[0][0] + @dataclass class AuthKeys: @@ -435,7 +446,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.sk_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: Optional[RemoteStorage] = None self.broker = broker self.run_id = run_id @@ -455,7 +466,6 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.enable_generations = False self.scrub_on_exit = False self.test_output_dir = test_output_dir @@ -468,7 +478,7 @@ class NeonEnvBuilder: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" if default_remote_storage_if_missing and self.pageserver_remote_storage is None: - self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + self.enable_pageserver_remote_storage(default_remote_storage()) self.env = NeonEnv(self) return self.env @@ -505,6 +515,66 @@ class NeonEnvBuilder: return env + def from_repo_dir( + self, + repo_dir: Path, + neon_binpath: Optional[Path] = None, + pg_distrib_dir: Optional[Path] = None, + ) -> NeonEnv: + """ + A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. 
+ """ + + # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests + self.neon_binpath = neon_binpath or self.neon_binpath + self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir + + # Get the initial tenant and timeline from the snapshot config + snapshot_config_toml = repo_dir / "config" + with snapshot_config_toml.open("r") as f: + snapshot_config = toml.load(f) + + self.initial_tenant = TenantId(snapshot_config["default_tenant_id"]) + self.initial_timeline = TimelineId( + dict(snapshot_config["branch_name_mappings"][DEFAULT_BRANCH_NAME])[ + str(self.initial_tenant) + ] + ) + self.env = self.init_configs() + + for ps_dir in repo_dir.glob("pageserver_*"): + tenants_from_dir = ps_dir / "tenants" + tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" + + log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") + shutil.copytree(tenants_from_dir, tenants_to_dir) + + for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): + sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name + log.info(f"Copying safekeeper directory {sk_from_dir} to {sk_to_dir}") + sk_to_dir.rmdir() + shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) + + shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + + if (attachments_json := Path(repo_dir / "attachments.json")).exists(): + shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + + # Update the config with info about tenants and timelines + with (self.repo_dir / "config").open("r") as f: + config = toml.load(f) + + config["default_tenant_id"] = snapshot_config["default_tenant_id"] + config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + + with (self.repo_dir / "config").open("w") as f: + toml.dump(config, f) + + return self.env + def 
enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run @@ -533,9 +603,11 @@ class NeonEnvBuilder: self.pageserver_remote_storage = ret def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): - assert self.sk_remote_storage is None, "sk_remote_storage already configured" + assert ( + self.safekeepers_remote_storage is None + ), "safekeepers_remote_storage already configured" - self.sk_remote_storage = self._configure_and_create_remote_storage( + self.safekeepers_remote_storage = self._configure_and_create_remote_storage( kind, RemoteStorageUser.SAFEKEEPER ) @@ -588,7 +660,7 @@ class NeonEnvBuilder: directory_to_clean.rmdir() def cleanup_remote_storage(self): - for x in [self.pageserver_remote_storage, self.sk_remote_storage]: + for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: if isinstance(x, S3Storage): x.do_cleanup() @@ -613,8 +685,7 @@ class NeonEnvBuilder: pageserver.stop(immediate=True) - if self.env.attachment_service is not None: - self.env.attachment_service.stop(immediate=True) + self.env.attachment_service.stop(immediate=True) cleanup_error = None @@ -692,7 +763,7 @@ class NeonEnv: self.pageservers: List[NeonPageserver] = [] self.broker = config.broker self.pageserver_remote_storage = config.pageserver_remote_storage - self.safekeepers_remote_storage = config.sk_remote_storage + self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath @@ -708,34 +779,22 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - if config.enable_generations: - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) - else: - 
self.control_plane_api = None - self.attachment_service = None + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: NeonAttachmentService = NeonAttachmentService(self) # Create a config file corresponding to the options - toml = textwrap.dedent( - f""" - default_tenant_id = '{config.initial_tenant}' - """ - ) + cfg: Dict[str, Any] = { + "default_tenant_id": str(self.initial_tenant), + "broker": { + "listen_addr": self.broker.listen_addr(), + }, + "pageservers": [], + "safekeepers": [], + } if self.control_plane_api is not None: - toml += textwrap.dedent( - f""" - control_plane_api = '{self.control_plane_api}' - """ - ) - - toml += textwrap.dedent( - f""" - [broker] - listen_addr = '{self.broker.listen_addr()}' - """ - ) + cfg["control_plane_api"] = self.control_plane_api # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -748,26 +807,24 @@ class NeonEnv: http=self.port_distributor.get_port(), ) - toml += textwrap.dedent( - f""" - [[pageservers]] - id={ps_id} - listen_pg_addr = 'localhost:{pageserver_port.pg}' - listen_http_addr = 'localhost:{pageserver_port.http}' - pg_auth_type = '{pg_auth_type}' - http_auth_type = '{http_auth_type}' - """ - ) - + ps_cfg: Dict[str, Any] = { + "id": ps_id, + "listen_pg_addr": f"localhost:{pageserver_port.pg}", + "listen_http_addr": f"localhost:{pageserver_port.http}", + "pg_auth_type": pg_auth_type, + "http_auth_type": http_auth_type, + } # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( self, ps_id, port=pageserver_port, - config_override=config.pageserver_config_override, + config_override=self.pageserver_config_override, ) ) + cfg["pageservers"].append(ps_cfg) + # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): port = SafekeeperPort( @@ -776,39 +833,28 @@ class NeonEnv: 
http=self.port_distributor.get_port(), ) id = config.safekeepers_id_start + i # assign ids sequentially - toml += textwrap.dedent( - f""" - [[safekeepers]] - id = {id} - pg_port = {port.pg} - pg_tenant_only_port = {port.pg_tenant_only} - http_port = {port.http} - sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" - ) + sk_cfg: Dict[str, Any] = { + "id": id, + "pg_port": port.pg, + "pg_tenant_only_port": port.pg_tenant_only, + "http_port": port.http, + "sync": config.safekeepers_enable_fsync, + } if config.auth_enabled: - toml += textwrap.dedent( - """ - auth_enabled = true - """ - ) - if config.sk_remote_storage is not None: - toml += textwrap.dedent( - f""" - remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}" - """ - ) - safekeeper = Safekeeper(env=self, id=id, port=port) - self.safekeepers.append(safekeeper) + sk_cfg["auth_enabled"] = True + if self.safekeepers_remote_storage is not None: + sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + cfg["safekeepers"].append(sk_cfg) - log.info(f"Config: {toml}") - self.neon_cli.init(toml) + log.info(f"Config: {cfg}") + self.neon_cli.init(cfg) def start(self): # Start up broker, pageserver and all safekeepers self.broker.try_start() - if self.attachment_service is not None: - self.attachment_service.start() + self.attachment_service.start() for pageserver in self.pageservers: pageserver.start() @@ -847,8 +893,8 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) - def get_pageserver_version(self) -> str: - bin_pageserver = str(self.neon_binpath / "pageserver") + def get_binary_version(self, binary_name: str) -> str: + bin_pageserver = str(self.neon_binpath / binary_name) res = subprocess.run( [bin_pageserver, "--version"], check=True, @@ -1287,10 +1333,10 @@ class 
NeonCli(AbstractNeonCli): def init( self, - config_toml: str, + config: Dict[str, Any], ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(config_toml) + tmp.write(toml.dumps(config)) tmp.flush() cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] @@ -1571,6 +1617,20 @@ class NeonAttachmentService: ) response.raise_for_status() + def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]: + response = requests.post( + f"{self.env.control_plane_api}/inspect", + json={"tenant_id": str(tenant_id)}, + ) + response.raise_for_status() + json = response.json() + log.info(f"Response: {json}") + if json["attachment"]: + # Explicit int() to make python type linter happy + return (int(json["attachment"][0]), int(json["attachment"][1])) + else: + return None + def __enter__(self) -> "NeonAttachmentService": return self @@ -1599,7 +1659,7 @@ class NeonPageserver(PgProtocol): self.running = False self.service_port = port self.config_override = config_override - self.version = env.get_pageserver_version() + self.version = env.get_binary_version("pageserver") # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to @@ -1714,11 +1774,16 @@ class NeonPageserver(PgProtocol): @property def workdir(self) -> Path: - return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}")) + return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") - errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors) + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return + + with logfile.open("r") as f: + errors = scan_pageserver_log_for_errors(f, self.allowed_errors) for _lineno, error in errors: log.info(f"not allowed error: {error.strip()}") @@ -1742,7 +1807,10 @@ class NeonPageserver(PgProtocol): def log_contains(self, pattern: str) -> Optional[str]: """Check that the pageserver log contains a line that matches the given regex""" - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None contains_re = re.compile(pattern) @@ -1751,14 +1819,11 @@ class NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. - while True: - line = logfile.readline() - if not line: - break - - if contains_re.search(line): - # found it! - return line + with logfile.open("r") as f: + for line in f: + if contains_re.search(line): + # found it! + return line return None @@ -1769,21 +1834,56 @@ class NeonPageserver(PgProtocol): Tenant attachment passes through here to acquire a generation number before proceeding to call into the pageserver HTTP client. 
""" - if self.env.attachment_service is not None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) - else: - generation = None - client = self.http_client() - return client.tenant_attach(tenant_id, config, config_null, generation=generation) + return client.tenant_attach( + tenant_id, + config, + config_null, + generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), + ) def tenant_detach(self, tenant_id: TenantId): - if self.env.attachment_service is not None: - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.attachment_service.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) + def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): + if config["mode"].startswith("Attached") and "generation" not in config: + config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + + client = self.http_client() + return client.tenant_location_conf(tenant_id, config, **kwargs) + + def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: + path = self.tenant_dir(tenant_id) / "config-v1" + log.info(f"Reading location conf from {path}") + bytes = open(path, "r").read() + try: + decoded: dict[str, Any] = toml.loads(bytes) + return decoded + except: + log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") + raise + + def tenant_create( + self, + tenant_id: TenantId, + conf: Optional[Dict[str, Any]] = None, + auth_token: Optional[str] = None, + generation: Optional[int] = None, + ) -> TenantId: + if generation is None: + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + client = self.http_client(auth_token=auth_token) + return client.tenant_create(tenant_id, conf, generation=generation) + + def tenant_load(self, tenant_id: TenantId): + client = self.http_client() + return client.tenant_load( + tenant_id, 
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + ) + def append_pageserver_param_overrides( params_to_update: List[str], @@ -1858,7 +1958,8 @@ class PgBin: command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None, - **kwargs: Any, + with_command_header=True, + **popen_kwargs: Any, ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. @@ -1871,7 +1972,13 @@ class PgBin: log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) base_path, _, _ = subprocess_capture( - self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs + self.log_dir, + command, + env=env, + cwd=cwd, + check=True, + with_command_header=with_command_header, + **popen_kwargs, ) return base_path @@ -2635,6 +2742,13 @@ class Endpoint(PgProtocol): ): self.stop() + # Checkpoints running endpoint and returns pg_wal size in MB. + def get_pg_wal_size(self): + log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + self.safe_psql("checkpoint") + assert self.pgdata_dir is not None # please mypy + return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -2682,6 +2796,7 @@ class EndpointFactory: lsn: Optional[Lsn] = None, hot_standby: bool = False, config_lines: Optional[List[str]] = None, + pageserver_id: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -2701,6 +2816,7 @@ class EndpointFactory: lsn=lsn, hot_standby=hot_standby, config_lines=config_lines, + pageserver_id=pageserver_id, ) def stop_all(self) -> "EndpointFactory": @@ -2811,7 +2927,10 @@ class Safekeeper: return res def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) + is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") + return SafekeeperHttpClient( + 
port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled + ) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2826,11 +2945,18 @@ class Safekeeper: tli_dir = self.timeline_dir(tenant_id, timeline_id) segments = [] for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if f != "safekeeper.control"]) + segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) segments.sort() return segments +# Walreceiver as returned by sk's timeline status endpoint. +@dataclass +class Walreceiver: + conn_id: int + state: str + + @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int @@ -2841,6 +2967,7 @@ class SafekeeperTimelineStatus: backup_lsn: Lsn peer_horizon_lsn: Lsn remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] @dataclass @@ -2854,10 +2981,11 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -2865,6 +2993,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + 
f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) @@ -2902,6 +3054,7 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -2911,6 +3064,7 @@ class SafekeeperHttpClient(requests.Session): backup_lsn=Lsn(resj["backup_lsn"]), peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, ) def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): @@ -3022,6 +3176,11 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / test_name.replace("/", "-") + + # We rerun flaky tests multiple times, use a separate directory for each run. 
+ if (suffix := getattr(request.node, "execution_count", None)) is not None: + test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" + log.info(f"get_test_output_dir is {test_dir}") # make mypy happy assert isinstance(test_dir, Path) @@ -3042,7 +3201,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|metadata|.+\.(?:toml|pid|json|sql)" + r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)" ) @@ -3303,8 +3462,6 @@ def parse_project_git_version_output(s: str) -> str: The information is generated by utils::project_git_version! """ - import re - res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s) if res and (commit := res.group(2)): return commit diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5c3ae3ce4b..74c6bddf23 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -79,6 +79,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these # and it is not a failure of our code when it happens. ".*DeleteObjects.*We encountered an internal error. Please try again.*", + # During shutdown, DownloadError::Cancelled may be logged as an error. 
Cleaning this + # up is tracked in https://github.com/neondatabase/neon/issues/6096 + ".*Cancelled, shutting down.*", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0e00d4a7de..6dea0d923d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -150,7 +150,7 @@ class PageserverHttpClient(requests.Session): # (this may change in future if we do fault injection of a kind that causes # requests TCP flows to stick) read=False, - backoff_factor=0, + backoff_factor=0.2, status_forcelist=[503], allowed_methods=None, remove_headers_on_redirect=[], @@ -210,16 +210,25 @@ class PageserverHttpClient(requests.Session): return res_json def tenant_create( - self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None + self, + new_tenant_id: TenantId, + conf: Optional[Dict[str, Any]] = None, + generation: Optional[int] = None, ) -> TenantId: if conf is not None: assert "new_tenant_id" not in conf.keys() + + body: Dict[str, Any] = { + "new_tenant_id": str(new_tenant_id), + **(conf or {}), + } + + if generation is not None: + body.update({"generation": generation}) + res = self.post( f"http://localhost:{self.port}/v1/tenant", - json={ - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - }, + json=body, ) self.verbose_error(res) if res.status_code == 409: @@ -260,13 +269,41 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) self.verbose_error(res) + def tenant_reset(self, tenant_id: TenantId, drop_cache: bool): + params = {} + if drop_cache: + params["drop_cache"] = "true" + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) + self.verbose_error(res) + + def tenant_location_conf( + self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None + ): + body = location_conf.copy() + body["tenant_id"] = str(tenant_id) + + params = {} 
+ if flush_ms is not None: + params["flush_ms"] = str(flush_ms) + + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", + json=body, + params=params, + ) + self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res - def tenant_load(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") + def tenant_load(self, tenant_id: TenantId, generation=None): + body = None + if generation is not None: + body = {"generation": generation} + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body) self.verbose_error(res) def tenant_ignore(self, tenant_id: TenantId): @@ -285,6 +322,10 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return TenantConfig.from_json(res.json()) + def tenant_heatmap_upload(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -362,12 +403,16 @@ class PageserverHttpClient(requests.Session): new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, + existing_initdb_timeline_id: Optional[TimelineId] = None, **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { "new_timeline_id": str(new_timeline_id), "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + "existing_initdb_timeline_id": str(existing_initdb_timeline_id) + if existing_initdb_timeline_id + else None, } if pg_version != PgVersion.NOT_SET: body["pg_version"] = int(pg_version) @@ -465,13 +510,21 @@ class PageserverHttpClient(requests.Session): 
assert res_json is None def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int + self, + tenant_id: TenantId, + timeline_id: TimelineId, + timestamp, + version: Optional[int] = None, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) + if version is None: + version_str = "" + else: + version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", ) self.verbose_error(res) res_json = res.json() @@ -661,7 +714,7 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - assert res.status_code == 200 + assert res.status_code in (200, 304) def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 30e3f527bf..b3c1174b35 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -6,9 +6,8 @@ from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn @dataclass class IndexLayerMetadata: - @classmethod - def from_json(cls, d: Dict[str, Any]): - return {} + file_size: int + generation: int @dataclass(frozen=True) @@ -139,7 +138,7 @@ class IndexPartDump: def from_json(cls, d: Dict[str, Any]) -> "IndexPartDump": return IndexPartDump( layer_metadata={ - parse_layer_file_name(n): IndexLayerMetadata.from_json(v) + parse_layer_file_name(n): IndexLayerMetadata(v["file_size"], v["generation"]) for n, v in d["layer_metadata"].items() }, disk_consistent_lsn=Lsn(d["disk_consistent_lsn"]), diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 
007ff387f4..e7b78cfb9a 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,7 +1,7 @@ import time -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional -from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef +from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -235,10 +235,14 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder -def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): +def assert_prefix_empty( + neon_env_builder: "NeonEnvBuilder", + prefix: Optional[str] = None, + allowed_postfix: Optional[str] = None, +): response = list_prefix(neon_env_builder, prefix) keys = response["KeyCount"] - objects = response.get("Contents", []) + objects: List[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) remote_storage = neon_env_builder.pageserver_remote_storage @@ -261,7 +265,18 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}" ) - assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + filtered_count = 0 + if allowed_postfix is None: + filtered_count = len(objects) + else: + for _obj in objects: + key: str = str(response.get("Key", [])) + if not (allowed_postfix.endswith(key)): + filtered_count += 1 + + assert ( + filtered_count == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 
954c3142a3..c0c2383feb 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -9,13 +9,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union import boto3 +import toml from mypy_boto3_s3 import S3Client from fixtures.log_helper import log -from fixtures.pageserver.types import LayerFileName from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" +TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @enum.unique @@ -88,20 +89,63 @@ class LocalFsStorage: def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) - def layer_path( - self, tenant_id: TenantId, timeline_id: TimelineId, layer_file_name: LayerFileName - ): - return self.timeline_path(tenant_id, timeline_id) / layer_file_name.to_str() + def timeline_latest_generation(self, tenant_id, timeline_id): + timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id)) + index_parts = [f for f in timeline_files if f.startswith("index_part")] + + def parse_gen(filename): + log.info(f"parsing index_part '{filename}'") + parts = filename.split("-") + if len(parts) == 2: + return int(parts[1], 16) + else: + return None + + generations = sorted([parse_gen(f) for f in index_parts]) + if len(generations) == 0: + raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}") + return generations[-1] def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: - return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME + latest_gen = self.timeline_latest_generation(tenant_id, timeline_id) + if latest_gen is None: + filename = TIMELINE_INDEX_PART_FILE_NAME + else: + filename = f"{TIMELINE_INDEX_PART_FILE_NAME}-{latest_gen:08x}" + + return self.timeline_path(tenant_id, timeline_id) / filename + + def remote_layer_path( + self, + tenant_id: TenantId, + timeline_id: TimelineId, 
+ local_name: str, + generation: Optional[int] = None, + ): + if generation is None: + generation = self.timeline_latest_generation(tenant_id, timeline_id) + + assert generation is not None, "Cannot calculate remote layer path without generation" + + filename = f"{local_name}-{generation:08x}" + return self.timeline_path(tenant_id, timeline_id) / filename def index_content(self, tenant_id: TenantId, timeline_id: TimelineId): with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) + def heatmap_path(self, tenant_id: TenantId) -> Path: + return self.tenant_path(tenant_id) / TENANT_HEATMAP_FILE_NAME + + def heatmap_content(self, tenant_id): + with self.heatmap_path(tenant_id).open("r") as f: + return json.load(f) + def to_toml_inline_table(self) -> str: - return f"local_path='{self.root}'" + rv = { + "local_path": str(self.root), + } + return toml.TomlEncoder().dump_inline_table(rv) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -142,18 +186,18 @@ class S3Storage: ) def to_toml_inline_table(self) -> str: - s = [ - f"bucket_name='{self.bucket_name}'", - f"bucket_region='{self.bucket_region}'", - ] + rv = { + "bucket_name": self.bucket_name, + "bucket_region": self.bucket_region, + } if self.prefix_in_bucket is not None: - s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'") + rv["prefix_in_bucket"] = self.prefix_in_bucket if self.endpoint is not None: - s.append(f"endpoint='{self.endpoint}'") + rv["endpoint"] = self.endpoint - return ",".join(s) + return toml.TomlEncoder().dump_inline_table(rv) def do_cleanup(self): if not self.cleanup: @@ -340,9 +384,16 @@ def s3_storage() -> RemoteStorageKind: return RemoteStorageKind.MOCK_S3 +def default_remote_storage() -> RemoteStorageKind: + """ + The remote storage kind used in tests that do not specify a preference + """ + return RemoteStorageKind.LOCAL_FS + + # serialize as toml inline table 
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): raise Exception("invalid remote storage type") - return f"{{{remote_storage.to_toml_inline_table()}}}" + return remote_storage.to_toml_inline_table() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d95368f990..ea648e460d 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -125,3 +125,51 @@ class TenantId(Id): class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + + +# Workaround for compat with python 3.9, which does not have `typing.Self` +TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") + + +class TenantShardId: + def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): + self.tenant_id = tenant_id + self.shard_number = shard_number + self.shard_count = shard_count + assert self.shard_number < self.shard_count or self.shard_count == 0 + + @classmethod + def parse(cls: Type[TTenantShardId], input) -> TTenantShardId: + if len(input) == 32: + return cls( + tenant_id=TenantId(input), + shard_number=0, + shard_count=0, + ) + elif len(input) == 37: + return cls( + tenant_id=TenantId(input[0:32]), + shard_number=int(input[33:35], 16), + shard_count=int(input[35:37], 16), + ) + else: + raise ValueError(f"Invalid TenantShardId '{input}'") + + def __str__(self): + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + + def _tuple(self) -> tuple[TenantId, int, int]: + return (self.tenant_id, self.shard_number, self.shard_count) + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() < other._tuple() + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() == other._tuple() + + def __hash__(self) -> int: + return hash(self._tuple()) diff --git 
a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1ec18b9f74..cda788b2a4 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -49,7 +49,8 @@ def subprocess_capture( echo_stdout=False, capture_stdout=False, timeout=None, - **kwargs: Any, + with_command_header=True, + **popen_kwargs: Any, ) -> Tuple[str, Optional[str], int]: """Run a process and bifurcate its output to files and the `log` logger @@ -86,13 +87,23 @@ def subprocess_capture( self.captured = "" def run(self): + first = with_command_header for line in self.in_file: + if first: + # do this only after receiving any input so that we can + # keep deleting empty files, or leave it out completly if + # it was unwanted (using the file as input later for example) + first = False + # prefix the files with the command line so that we can + # later understand which file is for what command + self.out_file.write((f"# {' '.join(cmd)}\n\n").encode("utf-8")) + # Only bother decoding if we are going to do something more than stream to a file if self.echo or self.capture: string = line.decode(encoding="utf-8", errors="replace") if self.echo: - log.info(string) + log.info(string.strip()) if self.capture: self.captured += string @@ -107,7 +118,7 @@ def subprocess_capture( p = subprocess.Popen( cmd, - **kwargs, + **popen_kwargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py new file mode 100644 index 0000000000..241531437c --- /dev/null +++ b/test_runner/fixtures/workload.py @@ -0,0 +1,148 @@ +from typing import Optional + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.types import TenantId, TimelineId + + +class Workload: + """ + This is not a general purpose load generator: it exists for 
storage tests that need to inject some + high level types of storage work via the postgres interface: + - layer writes (`write_rows`) + - work for compaction (`churn_rows`) + - reads, checking we get the right data (`validate`) + """ + + def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + self.env = env + self.tenant_id = tenant_id + self.timeline_id = timeline_id + self.table = "foo" + + self.expect_rows = 0 + self.churn_cursor = 0 + + self._endpoint: Optional[Endpoint] = None + + def endpoint(self, pageserver_id: int) -> Endpoint: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + "main", + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id="ep-workload", + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) + + connstring = self._endpoint.safe_psql( + "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" + ) + log.info(f"Workload.endpoint: connstr={connstring}") + + return self._endpoint + + def __del__(self): + if self._endpoint is not None: + self._endpoint.stop() + + def init(self, pageserver_id: int): + endpoint = self.endpoint(pageserver_id) + + endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") + endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def write_rows(self, n, pageserver_id): + endpoint = self.endpoint(pageserver_id) + start = self.expect_rows + end = start + n - 1 + self.expect_rows += n + dummy_value = "blah" + endpoint.safe_psql( + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + """ + ) + + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def churn_rows(self, n, pageserver_id, 
upload=True): + assert self.expect_rows >= n + + max_iters = 10 + endpoint = self.endpoint(pageserver_id) + todo = n + i = 0 + while todo > 0: + i += 1 + if i > max_iters: + raise RuntimeError("oops") + start = self.churn_cursor % self.expect_rows + n_iter = min((self.expect_rows - start), todo) + todo -= n_iter + + end = start + n_iter - 1 + + log.info( + f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}" + ) + + assert end < self.expect_rows + + self.churn_cursor += n_iter + dummy_value = "blah" + endpoint.safe_psql_many( + [ + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + f"VACUUM {self.table}", + ] + ) + + last_flush_lsn = wait_for_last_flush_lsn( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + ps_http = self.env.get_pageserver(pageserver_id).http_client() + wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + + if upload: + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id) + wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + + def validate(self, pageserver_id): + endpoint = self.endpoint(pageserver_id) + result = endpoint.safe_psql_many( + [ + "select clear_buffer_cache()", + f""" + SELECT COUNT(*) FROM {self.table} + """, + ] + ) + + log.info(f"validate({self.expect_rows}): {result}") + assert result == [[("",)], [(self.expect_rows,)]] diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 46acec0f63..edc23b29ba 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -55,9 +55,19 @@ def 
measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. + # + # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # the same tenant ID for a tenant that is logically different from the pageserver's point + # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, + # we will explicitly create the tenant in the same generation that it was previously + # attached in. + attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) + assert attach_status is not None + (attach_gen, _) = attach_status + client.tenant_delete(env.tenant) wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5) - client.tenant_create(new_tenant_id=env.tenant) + env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen) # Measure recovery time with env.record_duration("wal_recovery"): diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 0f7615f7ed..1e6e9a0174 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -17,6 +17,27 @@ class LabelledQuery: query: str +# This must run before all tests in this module +# create extension pg_stat_statements if it does not exist +# and TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +# Theoretically this could be in a module or session scope fixture, +# however the code depends on other fixtures that have function scope +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Creating extension pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Creating extension pg_stat_statements") + query = LabelledQuery( + 
"Q_CREATE_EXTENSION", r"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + ) + run_psql(remote_compare, query, times=1, explain=False) + log.info("Reset pg_stat_statements") + query = LabelledQuery("Q_RESET", r"SELECT pg_stat_statements_reset();") + run_psql(remote_compare, query, times=1, explain=False) + + # A list of queries to run. # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports @@ -78,6 +99,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # fmt: on ) +EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" + def get_scale() -> List[str]: # We parametrize each tpc-h and clickbench test with scale @@ -88,7 +111,10 @@ def get_scale() -> List[str]: return [scale] -def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None: +# run the query times times plus once with EXPLAIN VERBOSE if explain is requestd +def run_psql( + env: RemoteCompare, labelled_query: LabelledQuery, times: int, explain: bool = False +) -> None: # prepare connstr: # - cut out password from connstr to pass it via env # - add options to connstr @@ -108,6 +134,13 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N log.info(f"Run {run}/{times}") with env.zenbenchmark.record_duration(f"{label}/{run}"): env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ) + if explain: + log.info(f"Explaining query {label}") + run += 1 + with env.zenbenchmark.record_duration(f"{label}/EXPLAIN"): + env.pg_bin.run_capture( + ["psql", connstr, "-c", f"{EXPLAIN_STRING} {query}"], env=environ + ) @pytest.mark.parametrize("scale", get_scale()) @@ -118,10 +151,13 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: An OLAP-style ClickHouse benchmark Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql - The DB prepared manually in 
advance + The DB prepared manually in advance. + Important: after intial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;` + to ensure that Postgres optimizer chooses the same plans as RDS and Aurora. """ + explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true" - run_psql(remote_compare, query, times=3) + run_psql(remote_compare, query, times=3, explain=explain) def tpch_queuies() -> Tuple[ParameterSet, ...]: @@ -195,3 +231,16 @@ def test_user_examples(remote_compare: RemoteCompare): """, ) run_psql(remote_compare, query, times=3) + + +# This must run after all tests in this module +# Collect pg_stat_statements after running the tests if TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Collecting pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Collecting pg_stat_statements") + query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") + run_psql(remote_compare, query, times=1, explain=False) diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 8381eac946..3579c92b0c 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st subprocess_capture(test_output_dir, build_cmd, check=True) run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] - basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True) + _, output, _ = subprocess_capture(test_output_dir, run_cmd, check=True, capture_stdout=True) - assert Path(f"{basepath}.stdout").read_text().strip() == "1" + assert str(output).strip() == "1" diff --git 
a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 10cffb1d6c..32397bbcc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -52,7 +52,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N TenantId(t["id"]) for t in ps_http.tenant_list() ], "tenant should not be attached after negative test" - env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request") + env.pageserver.allowed_errors.extend( + [ + # This fixture detaches the tenant, and tests using it will tend to re-attach it + # shortly after. There may be un-processed deletion_queue validations from the + # initial attachment + ".*Dropped remote consistent LSN updates.*", + # This fixture is for tests that will intentionally generate 400 responses + ".*Error processing HTTP request: Bad request", + ] + ) def log_contains_bad_request(): env.pageserver.log_contains(".*Error processing HTTP request: Bad request") @@ -100,7 +109,6 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): env = negative_env.neon_env tenant_id = negative_env.tenant_id - ps_http = env.pageserver.http_client() config_with_unknown_keys = { "compaction_period": "1h", @@ -108,16 +116,16 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): } with pytest.raises(PageserverApiException) as e: - ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys) + env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys) assert e.type == PageserverApiException assert e.value.status_code == 400 @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]): +def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): """ - For backwards-compatibility: if we send an empty body, - the request should be accepted 
and the config should be the default config. + When the 'config' body attribute is omitted, the request should be accepted + and the tenant should use the default configuration """ env = positive_env ps_http = env.pageserver.http_client() @@ -128,9 +136,11 @@ def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] + body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} + ps_http.post( f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - data=b"", + json=body, headers=None if content_type else {"Content-Type": "application/json"}, ).raise_for_status() @@ -159,6 +169,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", + "heatmap_period": "10m", "image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", @@ -191,7 +202,7 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" ps_http.tenant_detach(tenant_id) - ps_http.tenant_attach(tenant_id, config=fully_custom_config) + env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set( diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index f729bdee98..bd87ff3efd 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -60,14 +60,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): assert_client_authorized(env, invalid_tenant_http_client) # create tenant using management token - pageserver_http_client.tenant_create(TenantId.generate()) + 
env.pageserver.tenant_create(TenantId.generate(), auth_token=pageserver_token) # fail to create tenant using tenant token with pytest.raises( PageserverApiException, match="Forbidden: JWT authentication error", ): - tenant_http_client.tenant_create(TenantId.generate()) + env.pageserver.tenant_create(TenantId.generate(), auth_token=tenant_token) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): @@ -92,8 +92,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) @@ -145,9 +146,9 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") - + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a19b2862f8..9879254897 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ 
-14,8 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*invalid branch start lsn.*") - env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") + env.pageserver.allowed_errors.extend( + [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] + ) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index c4f743204e..9a0b91b54e 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,8 +1,7 @@ import random import threading import time -from queue import SimpleQueue -from typing import Any, Dict, List, Union +from typing import List import pytest from fixtures.log_helper import log @@ -148,17 +147,17 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ] ) ps_http = env.pageserver.http_client() # pause all uploads ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) - ps_http.tenant_create(env.initial_tenant) + env.pageserver.tenant_create(env.initial_tenant) initial_branch = 
"initial_branch" @@ -200,7 +199,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder # pause all uploads ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) - ps_http.tenant_create(env.initial_tenant) + env.pageserver.tenant_create(env.initial_tenant) def start_creating_timeline(): with pytest.raises(RequestException): @@ -239,92 +238,6 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder t.join() -def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder): - """ - If the activate only after upload is used, then retries could become competing. - """ - - env = neon_env_builder.init_configs() - env.start() - - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory" - ) - ps_http = env.pageserver.http_client() - - # pause all uploads - ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) - ps_http.tenant_create(env.initial_tenant) - - def start_creating_timeline(): - ps_http.timeline_create( - env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 - ) - - create_root = threading.Thread(target=start_creating_timeline) - - branch_id = TimelineId.generate() - - queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue() - barrier = threading.Barrier(3) - - def try_branch(): - barrier.wait() - barrier.wait() - try: - ret = ps_http.timeline_create( - env.pg_version, - env.initial_tenant, - branch_id, - ancestor_timeline_id=env.initial_timeline, - timeout=5, - ) - queue.put(ret) - except Exception as e: - queue.put(e) - - threads = [threading.Thread(target=try_branch) for _ in range(2)] - - try: - create_root.start() - - for t in threads: - t.start() - - 
wait_until_paused(env, "before-upload-index-pausable") - - barrier.wait() - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) - barrier.wait() - - # now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files - first = queue.get() - second = queue.get() - - log.info(first) - log.info(second) - - (succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first) - assert isinstance(failed, Exception) - assert isinstance(succeeded, Dict) - - # there's multiple valid status codes: - # - Timeline x/y already exists - # - whatever 409 response says, but that is a subclass of PageserverApiException - assert isinstance(failed, PageserverApiException) - assert succeeded["state"] == "Active" - finally: - # we might still have the failpoint active - env.pageserver.stop(immediate=True) - - for t in threads: - t.join() - create_root.join() - - def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder): """ Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation. @@ -343,8 +256,7 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N ) ps_http = env.pageserver.http_client() - # pause all uploads - ps_http.tenant_create(env.initial_tenant) + env.pageserver.tenant_create(env.initial_tenant) # Create a timeline whose creation will succeed. The tenant will need at least one # timeline to be loadable. 
@@ -397,7 +309,7 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ) ps_http = env.pageserver.http_client() - ps_http.tenant_create(env.initial_tenant) + env.pageserver.tenant_create(env.initial_tenant) ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) # pause all uploads diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4911fc09d6..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" @@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Failed to load index_part from remote storage, failed creation?.*", ] ) @@ -144,8 +143,13 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) ), "pageserver should clean its temp timeline files on timeline creation failure" -def 
test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() +# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups +@pytest.mark.parametrize("exit_or_return", ["return", "exit"]) +def test_timeline_init_break_before_checkpoint_recreate( + neon_env_builder: NeonEnvBuilder, exit_or_return: str +): + env = neon_env_builder.init_configs() + env.start() pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend( @@ -156,6 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn ] ) + env.pageserver.tenant_create(env.initial_tenant) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) @@ -166,13 +171,17 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b") # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) - with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id - ) + failpoint = "before-checkpoint-new-timeline" + pattern = failpoint + if exit_or_return == "exit": + # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead + pattern = "Connection aborted." - # Restart the page server + pageserver_http.configure_failpoints((failpoint, exit_or_return)) + with pytest.raises(Exception, match=pattern): + _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id) + + # Restart the page server (with the failpoint disabled) env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. 
The other timelines on tenant should still be present and work normally. @@ -186,11 +195,9 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_dirs == initial_timeline_dirs ), "pageserver should clean its temp timeline files on timeline creation failure" - # Disable the failpoint again - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off")) # creating the branch should have worked now - new_timeline_id = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id + new_timeline_id = TimelineId( + pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"] ) assert timeline_id == new_timeline_id diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 410bf03c2b..adb67a579e 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,6 +14,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() + for pageserver in env.pageservers: + # This test dual-attaches a tenant, one of the pageservers will therefore + # be running with a stale generation. + pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") @@ -79,6 +84,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # Try failing back, and this time we will stop the current pageserver before reconfiguring # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this # is more like what happens in an unexpected pageserver failure. 
+ # + # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # about to start as the attached pageserver + env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() @@ -88,6 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() + # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # about to start as the attached pageserver + env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() # Test a (former) bug where a child process spins without updating its connection string diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f3c6af4427..5a9c2782e6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,30 +1,25 @@ -import copy import os import shutil import subprocess import tempfile from pathlib import Path -from typing import Any, List, Optional +from typing import List, Optional import pytest -import toml # TODO: replace with tomllib for Python >= 3.11 -from fixtures.log_helper import log +import toml from fixtures.neon_fixtures import ( - NeonCli, + NeonEnv, NeonEnvBuilder, PgBin, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn -from pytest import FixtureRequest # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
@@ -37,8 +32,8 @@ from pytest import FixtureRequest # If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. # # The file contains a couple of helper functions: -# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). # - check_neon_works performs the test itself, feel free to add more checks there. +# - dump_differs compares two SQL dumps and writes the diff to a file. # # # How to run `test_backward_compatibility` locally: @@ -46,6 +41,7 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and create a data snapshot: # rm -rf pg_install target @@ -59,8 +55,7 @@ from pytest import FixtureRequest # CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` # # # Run backward compatibility test -# COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \ -# ./scripts/pytest -k test_backward_compatibility +# ./scripts/pytest -k test_backward_compatibility # # # How to run `test_forward_compatibility` locally: @@ -68,6 +63,8 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} +# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -84,9 +81,7 @@ from pytest import FixtureRequest # ./scripts/pytest -k test_create_snapshot # # # Run forward compatibility test -# COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \ -# COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \ -# ./scripts/pytest -k 
test_forward_compatibility +# ./scripts/pytest -k test_forward_compatibility # check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( @@ -155,13 +150,9 @@ def test_create_snapshot( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( - pg_bin: PgBin, - port_distributor: PortDistributor, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - neon_binpath: Path, - pg_distrib_dir: Path, pg_version: PgVersion, - request: FixtureRequest, ): """ Test that the new binaries can read old data @@ -177,23 +168,15 @@ def test_backward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - ) + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - neon_binpath, - neon_binpath, - pg_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - pg_bin, - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -212,12 +195,10 @@ def test_backward_compatibility( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, top_output_dir: Path, - port_distributor: PortDistributor, pg_version: PgVersion, - request: FixtureRequest, - neon_binpath: Path, ): """ Test that the old binaries can read new data @@ -244,24 +225,19 @@ def test_forward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - 
to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - compatibility_neon_bin, - neon_binpath, - compatibility_postgres_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version), - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -276,193 +252,45 @@ def test_forward_compatibility( ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -def prepare_snapshot( - from_dir: Path, - to_dir: Path, - port_distributor: PortDistributor, - pg_distrib_dir: Optional[Path] = None, -): - assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" - assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory" - assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql" +def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): + ep = env.endpoints.create_start("main") + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - log.info(f"Copying snapshot from {from_dir} to {to_dir}") - shutil.copytree(from_dir, to_dir) - - repo_dir = to_dir / "repo" - - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - - # Remove old logs to avoid confusion in test artifacts - for logfile in repo_dir.glob("**/*.log"): - logfile.unlink() - - # Remove old computes in 'endpoints'. 
Old versions of the control plane used a directory - # called "pgdatadirs". Delete it, too. - if (repo_dir / "endpoints").exists(): - shutil.rmtree(repo_dir / "endpoints") - if (repo_dir / "pgdatadirs").exists(): - shutil.rmtree(repo_dir / "pgdatadirs") - os.mkdir(repo_dir / "endpoints") - - # Update paths and ports in config files - legacy_pageserver_toml = repo_dir / "pageserver.toml" - legacy_bundle = os.path.exists(legacy_pageserver_toml) - - path_to_config: dict[Path, dict[Any, Any]] = {} - if legacy_bundle: - os.mkdir(repo_dir / "pageserver_1") - path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load( - legacy_pageserver_toml - ) - os.remove(legacy_pageserver_toml) - os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants") - else: - for ps_conf in snapshot_config["pageservers"]: - config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml" - path_to_config[config_path] = toml.load(config_path) - - # For each pageserver config, edit it and rewrite - for config_path, pageserver_config in path_to_config.items(): - pageserver_config["remote_storage"]["local_path"] = str( - LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER) - ) - - for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): - pageserver_config[param] = port_distributor.replace_with_new_port( - pageserver_config[param] - ) - - # We don't use authentication in compatibility tests - # so just remove authentication related settings. - pageserver_config.pop("pg_auth_type", None) - pageserver_config.pop("http_auth_type", None) - - if pg_distrib_dir: - pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with config_path.open("w") as f: - toml.dump(pageserver_config, f) - - # neon_local config doesn't have to be backward compatible. If we're using a dump from before - # it supported multiple pageservers, fix it up. 
- if "pageservers" not in snapshot_config: - snapshot_config["pageservers"] = [snapshot_config["pageserver"]] - del snapshot_config["pageserver"] - - for param in ("listen_http_addr", "listen_pg_addr"): - for pageserver in snapshot_config["pageservers"]: - pageserver[param] = port_distributor.replace_with_new_port(pageserver[param]) - snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( - snapshot_config["broker"]["listen_addr"] - ) - for sk in snapshot_config["safekeepers"]: - for param in ("http_port", "pg_port", "pg_tenant_only_port"): - sk[param] = port_distributor.replace_with_new_port(sk[param]) - - if pg_distrib_dir: - snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with snapshot_config_toml.open("w") as f: - toml.dump(snapshot_config, f) - - # Ensure that snapshot doesn't contain references to the original path - rv = subprocess.run( - [ - "grep", - "--recursive", - "--binary-file=without-match", - "--files-with-matches", - "test_create_snapshot/repo", - str(repo_dir), - ], - capture_output=True, - text=True, - ) - assert ( - rv.returncode != 0 - ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" - - -def check_neon_works( - repo_dir: Path, - neon_target_binpath: Path, - neon_current_binpath: Path, - pg_distrib_dir: Path, - pg_version: PgVersion, - port_distributor: PortDistributor, - test_output_dir: Path, - pg_bin: PgBin, - request: FixtureRequest, -): - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["neon_distrib_dir"] = str(neon_target_binpath) - snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - - # TODO: replace with NeonEnvBuilder / NeonEnv - config: Any = type("NeonEnvStub", (object,), {}) - config.rust_log_override = None - config.repo_dir = repo_dir - config.pg_version = pg_version - 
config.initial_tenant = snapshot_config["default_tenant_id"] - config.pg_distrib_dir = pg_distrib_dir - config.remote_storage = None - config.sk_remote_storage = None - - # Use the "target" binaries to launch the storage nodes - config_target = config - config_target.neon_binpath = neon_target_binpath - # We are using maybe-old binaries for neon services, but want to use current - # binaries for test utilities like neon_local - config_target.neon_local_binpath = neon_current_binpath - cli_target = NeonCli(config_target) - - # And the current binaries to launch computes - snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - config_current = copy.copy(config) - config_current.neon_binpath = neon_current_binpath - cli_current = NeonCli(config_current) - - cli_target.raw_cli(["start"]) - request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) - - pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() - cli_current.endpoint_create( - branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main" - ) - cli_current.endpoint_start("ep-main") - request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main")) - - connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" + connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) initial_dump_differs = dump_differs( - repo_dir.parent / "dump.sql", + sql_dump_path, test_output_dir / "dump.sql", test_output_dir / "dump.filediff", ) # Check that project can be recovered from WAL # loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720 - tenant_id = snapshot_config["default_tenant_id"] - timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] - pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1] - 
pageserver_http = PageserverHttpClient( - port=pageserver_port, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + pageserver_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + pg_version = env.pg_version + + # Delete all files from local_fs_remote_storage except initdb.tar.zst, + # the file is required for `timeline_create` with `existing_initdb_timeline_id`. + # + # TODO: switch to Path.walk() in Python 3.12 + # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk(): + for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"): + for filename in filenames: + if filename != "initdb.tar.zst": + (Path(dirpath) / filename).unlink() + + timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) + pageserver_http.timeline_create( + pg_version=pg_version, + tenant_id=tenant_id, + new_timeline_id=timeline_id, + existing_initdb_timeline_id=timeline_id, ) - shutil.rmtree(repo_dir / "local_fs_remote_storage") - timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) - pageserver_http.timeline_create(pg_version, tenant_id, timeline_id) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) @@ -494,6 +322,11 @@ def dump_differs( Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False). 
""" + if not first.exists(): + raise FileNotFoundError(f"{first} doesn't exist") + if not second.exists(): + raise FileNotFoundError(f"{second} doesn't exist") + with output.open("w") as stdout: res = subprocess.run( [ diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py old mode 100755 new mode 100644 diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index f3f3a1ddf3..9fdc4d59f5 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,6 +1,7 @@ +import enum import time from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import pytest import toml @@ -64,6 +65,23 @@ def test_min_resident_size_override_handling( assert_config(tenant_id, None, config_level_override) +@enum.unique +class EvictionOrder(str, enum.Enum): + ABSOLUTE_ORDER = "absolute" + RELATIVE_ORDER_EQUAL = "relative_equal" + RELATIVE_ORDER_SPARE = "relative_spare" + + def config(self) -> Dict[str, Any]: + if self == EvictionOrder.ABSOLUTE_ORDER: + return {"type": "AbsoluteAccessed"} + elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + elif self == EvictionOrder.RELATIVE_ORDER_SPARE: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + else: + raise RuntimeError(f"not implemented: {self}") + + @dataclass class EvictionEnv: timelines: list[Tuple[TenantId, TimelineId]] @@ -108,13 +126,14 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior + self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder ): disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, "min_avail_bytes": min_avail_bytes, "mock_statvfs": mock_behavior, + 
"eviction_order": eviction_order.config(), } enc = toml.TomlEncoder() @@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_evicts_until_pressure_is_relieved( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Basic test to ensure that we evict enough to relieve pressure. """ @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. 
""" @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. 
We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. # So, set target to all occupied space, except 2*env.layer_size per tenant target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): ), "all tenants should have lost some layers" warm_size = later_du_by_timeline[warm] - - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] - - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size - cold_size = later_du_by_timeline[cold] - cold_upper = 2 * env.layer_size - log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" - ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + if order == EvictionOrder.ABSOLUTE_ORDER: + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. 
+ # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + cold_upper = 2 * env.layer_size + log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + + assert ( + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + else: + # just go with the space was freed, find proper limits later + pass def poor_mans_du( @@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") @@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): @@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
"name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index bcf99cae7c..224e6f50c7 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -112,7 +112,9 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) assert l1_found is not None, "failed to find L1 locally" - uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name + uploaded = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, l1_found.name + ) assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" env.pageserver.start() @@ -139,4 +141,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) + uploaded = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, l1_found.name + ) assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 214f1f33a8..a456c06862 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -20,6 +20,7 @@ def test_fullbackup( pg_bin: PgBin, port_distributor: PortDistributor, pg_distrib_dir: Path, + test_output_dir: Path, ): env = neon_env_builder.init_start() @@ -49,10 +50,12 @@ def test_fullbackup( restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - tar_output_file = result_basepath + ".stdout" - subprocess_capture(env.repo_dir, ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + 
tar_output_file = test_output_dir / "fullbackup.tar" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] + pg_bin.run_capture(cmd, env=psql_env) + subprocess_capture( + env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] + ) # HACK # fullbackup returns neon specific pg_control and first WAL segment diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index be3355f5cc..284a8c3563 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -35,6 +35,11 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) + # Because this test does a rapid series of restarts of the same node, it's possible that + # we are restarted again before we can clean up deletion lists from the previous generation, + # resulting in a subsequent startup logging a warning. 
+ env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*") + for _ in range(5): with pytest.raises(subprocess.SubprocessError): pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 031fd2857d..7822e29ed9 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,19 +1,59 @@ +import os +import re import time -from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv + + +def wait_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = primary.safe_psql_scalar( + "SELECT pg_current_wal_insert_lsn()::text", log_query=False + ) + while True: + secondary_lsn = secondary.safe_psql_scalar( + "SELECT pg_last_wal_replay_lsn()", log_query=False + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +# Check for corrupted WAL messages which might otherwise go unnoticed if +# reconnection fixes this. +def scan_standby_log_for_errors(secondary): + log_path = secondary.endpoint_path() / "compute.log" + with log_path.open("r") as f: + markers = re.compile( + r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) + for line in f: + if markers.search(line): + log.info(f"bad error in standby log: {line}") + raise AssertionError() def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env + # We've had a bug caused by WAL records split across multiple XLogData + # messages resulting in corrupted WAL complaints on standby. It reproduced + # only when sending from safekeeper is slow enough to grab full + # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in + # one conf to decrease test time. 
+ slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "") + if slow_down_send: + sk_http = env.safekeepers[0].http_client() + sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")]) + with env.endpoints.create_start( branch_name="main", endpoint_id="primary", ) as primary: time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: - primary_lsn = None - caught_up = False queries = [ "SHOW neon.timeline_id", "SHOW neon.tenant_id", @@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - p_con.commit() - - with p_con.cursor() as p_cur: - p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") - res = p_cur.fetchone() - assert res is not None - (lsn,) = res - primary_lsn = lsn - - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - # Note that this may generate more WAL if the transaction has changed - # things, but we don't care about that. 
- p_con.commit() - for query in queries: with p_con.cursor() as p_cur: p_cur.execute(query) @@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv): response = res responses[query] = response + # insert more data to make safekeeper send MAX_SEND_SIZE messages + if slow_down_send: + primary.safe_psql("create table t(key int, value text)") + primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") + + wait_caughtup(primary, secondary) + with secondary.connect() as s_con: with s_con.cursor() as s_cur: s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") res = s_cur.fetchone() assert res is not None - while not caught_up: - with s_con.cursor() as secondary_cursor: - secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") - res = secondary_cursor.fetchone() - assert res is not None - (secondary_lsn,) = res - # There may be more changes on the primary after we got our LSN - # due to e.g. autovacuum, but that shouldn't impact the content - # of the tables, so we check whether we've replayed up to at - # least after the commit of the `test` table. - caught_up = secondary_lsn >= primary_lsn - - # Explicit commit to flush any transient transaction-level state. 
- s_con.commit() - for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) response = secondary_cursor.fetchone() assert response is not None assert response == responses[query] + + scan_standby_log_for_errors(secondary) + + # clean up + if slow_down_send: + sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d357bd0ee4..faedf5d944 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -84,8 +84,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - client = env.pageserver.http_client() - client.tenant_create(tenant) + env.pageserver.tenant_create(tenant) env.pageserver.allowed_errors.extend( [ @@ -100,12 +99,13 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ] ) - # FIXME: we should clean up pageserver to not print this - env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*") - - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: we should clean up pageserver to not print this + ".*exited with error: unexpected message type: CopyData.*", + # FIXME: Is this expected? 
+ ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) def import_tar(base, wal): @@ -149,6 +149,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ".*WARN.*ignored .* unexpected bytes after the tar archive.*" ) + client = env.pageserver.http_client() timeline_delete_wait_completed(client, tenant, timeline) # Importing correct backup works @@ -163,7 +164,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build assert endpoint.safe_psql("select count(*) from t") == [(300000,)] -def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_import_from_pageserver_small( + pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -177,7 +180,7 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) @pytest.mark.timeout(1800) @@ -185,7 +188,9 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # the test back after finding the failure cause. 
# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") -def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_import_from_pageserver_multisegment( + pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -205,7 +210,9 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + tar_output_file = _import( + num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir + ) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -246,7 +253,8 @@ def _import( pg_bin: PgBin, timeline: TimelineId, pg_distrib_dir: Path, -) -> str: + test_output_dir: Path, +) -> Path: """Test importing backup data to the pageserver. 
Args: @@ -263,9 +271,9 @@ def _import( # Get a fullbackup from pageserver query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - tar_output_file = result_basepath + ".stdout" + tar_output_file = test_output_dir / "fullbackup.tar" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] + pg_bin.run_capture(cmd, env=psql_env) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -285,7 +293,7 @@ def _import( # Import to pageserver endpoint_id = "ep-import_from_pageserver" client = env.pageserver.http_client() - client.tenant_create(tenant) + env.pageserver.tenant_create(tenant) env.neon_cli.raw_cli( [ "timeline", @@ -299,7 +307,7 @@ def _import( "--base-lsn", str(lsn), "--base-tarfile", - os.path.join(tar_output_file), + str(tar_output_file), "--pg-version", env.pg_version, ] @@ -315,9 +323,17 @@ def _import( # Take another fullbackup query = f"fullbackup { tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - new_tar_output_file = result_basepath + ".stdout" + new_tar_output_file = test_output_dir / "fullbackup-new.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + query, + "-o", + str(new_tar_output_file), + ] + pg_bin.run_capture(cmd, env=psql_env) # Check it's the same as the first fullbackup # TODO pageserver should be checking checksum diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index b36c927628..999e077e45 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) ps_http = env.pageserver.http_client() @@ -49,7 +52,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "compaction_period": "0s", # we want to control when compaction runs "checkpoint_timeout": "24h", # something we won't reach "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually - "image_creation_threshold": f"{image_creation_threshold}", + "image_creation_threshold": "100", # we want to control when image is created "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } @@ -124,6 +127,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ), "sanity check for what above loop is supposed to do" # create the image layer from the future + ps_http.patch_tenant_config_client_side( + tenant_id, {"image_creation_threshold": image_creation_threshold}, None + ) + assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1 ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True) assert ( len( @@ -145,19 +152,27 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - future_layer_path = env.pageserver_remote_storage.layer_path( - tenant_id, timeline_id, future_layer + future_layer_path = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, future_layer.to_str() ) log.info(f"future layer path: {future_layer_path}") pre_stat = future_layer_path.stat() time.sleep(1.1) # so 
that we can use change in pre_stat.st_mtime to detect overwrites + def get_generation_number(): + attachment = env.attachment_service.inspect(tenant_id) + assert attachment is not None + return attachment[0] + # force removal of layers from the future tenant_conf = ps_http.tenant_config(tenant_id) - ps_http.tenant_detach(tenant_id) + generation_before_detach = get_generation_number() + env.pageserver.tenant_detach(tenant_id) failpoint_name = "before-delete-layer-pausable" + ps_http.configure_failpoints((failpoint_name, "pause")) - ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) + env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) + generation_after_reattach = get_generation_number() wait_until_tenant_active(ps_http, tenant_id) # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue. @@ -173,6 +188,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") wait_until(10, 0.5, delete_at_pause_point) + future_layer_path = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach + ) + log.info(f"future layer path: {future_layer_path}") assert future_layer_path.exists() # wait for re-ingestion of the WAL from safekeepers into the in-memory layer @@ -211,12 +230,17 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # Examine the resulting S3 state. 
log.info("integrity-check the remote storage") ip = get_index_part() - for layer_file_name in ip.layer_metadata.keys(): - layer_path = env.pageserver_remote_storage.layer_path( - tenant_id, timeline_id, layer_file_name + for layer_file_name, layer_metadata in ip.layer_metadata.items(): + log.info(f"Layer metadata {layer_file_name.to_str()}: {layer_metadata}") + layer_path = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, layer_file_name.to_str(), layer_metadata.generation ) assert layer_path.exists(), f"{layer_file_name.to_str()}" log.info("assert that the overwritten layer won") + future_layer_path = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach + ) final_stat = future_layer_path.stat() + log.info(f"future layer path: {future_layer_path}") assert final_stat.st_mtime != pre_stat.st_mtime diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index d2d8d71e3f..51e358e60d 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -236,3 +236,30 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") + + +# +# Check that slots are not inherited by a branch +# +def test_slots_and_branching(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant, timeline = env.neon_cli.create_tenant() + env.pageserver.http_client() + + main_branch = env.endpoints.create_start("main", tenant_id=tenant) + main_cur = main_branch.connect().cursor() + + # Create a logical replication slot on the main branch + main_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + wait_for_last_flush_lsn(env, main_branch, tenant, timeline) + + # Create branch ws. 
+ env.neon_cli.create_branch("ws", "main", tenant_id=tenant) + ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) + log.info("postgres is running on 'ws' branch") + + # Check that we can create slot with the same name + ws_cur = ws_branch.connect().cursor() + ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f79c1c347c..65d6d7a9fd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -8,71 +8,6 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") - - cur = endpoint_main.connect().cursor() - # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. - # - # Each row contains current insert LSN and the current timestamp, when - # the row was inserted. 
- cur.execute("SET synchronous_commit=off") - cur.execute("CREATE TABLE foo (x integer)") - tbl = [] - for i in range(1000): - cur.execute("INSERT INTO foo VALUES(%s)", (i,)) - # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) - tbl.append([i, after_timestamp]) - - # Execute one more transaction with synchronous_commit enabled, to flush - # all the previous transactions - cur.execute("SET synchronous_commit=on") - cur.execute("INSERT INTO foo VALUES (-1)") - - # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) - - with env.pageserver.http_client() as client: - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - lsn = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - # Call get_lsn_by_timestamp to get the LSN - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. 
- endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn - ) - assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i - - endpoint_here.stop_and_destroy() - - # # Test pageserver get_lsn_by_timestamp API # @@ -130,7 +65,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "future" # make sure that we return a well advanced lsn here @@ -139,7 +74,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range @@ -149,7 +84,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index de18ea0e6b..16d120e24a 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -133,6 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Stop default ps/sk env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() + env.neon_cli.attachment_service_stop(False) # Keep NeonEnv state up to date, it usually 
owns starting/stopping services env.pageserver.running = False @@ -173,6 +174,9 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1) env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) + # Stop this to get out of the way of the following `start` + env.neon_cli.attachment_service_stop(False) + # Default start res = env.neon_cli.raw_cli(["start"]) res.check_returncode() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index a4cd42b6c3..af2d7aae88 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -5,7 +5,6 @@ import time from collections import defaultdict from typing import Any, DefaultDict, Dict, Tuple -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +18,7 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_for_upload_queue_empty, ) -from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until @@ -45,13 +44,7 @@ def get_num_downloaded_layers(client: PageserverHttpClient): # If you have a large relation, check that the pageserver downloads parts of it as # require by queries. # -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_ondemand_download_large_rel( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, -): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - +def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder): # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. 
env = neon_env_builder.init_start( @@ -145,13 +138,7 @@ def test_ondemand_download_large_rel( # If you have a relation with a long history of updates, the pageserver downloads the layer # files containing the history as needed by timetravel queries. # -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_ondemand_download_timetravel( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, -): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - +def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. @@ -229,8 +216,7 @@ def test_ondemand_download_timetravel( assert filled_current_physical == filled_size, "we don't yet do layer eviction" # Wait until generated image layers are uploaded to S3 - if remote_storage_kind is not None: - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) env.pageserver.stop() @@ -384,7 +370,7 @@ def test_download_remote_layers_api( env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", ] ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 2d83788193..573d2139ce 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -8,7 +8,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import 
wait_until @@ -62,7 +61,10 @@ def test_pageserver_init_node_id( assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId): +def check_client(env: NeonEnv, client: PageserverHttpClient): + pg_version = env.pg_version + initial_tenant = env.initial_tenant + client.check_status() # check initial tenant is there @@ -70,7 +72,9 @@ def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_te # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create(tenant_id) + client.tenant_create( + tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) @@ -181,7 +185,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): def test_pageserver_http_api_client(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: - check_client(env.pg_version, client, env.initial_tenant) + check_client(env, client) def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): @@ -191,4 +195,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde pageserver_token = env.auth_keys.generate_pageserver_token() with env.pageserver.http_client(auth_token=pageserver_token) as client: - check_client(env.pg_version, client, env.initial_tenant) + check_client(env, client) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 66cc286aba..9c2f5786d4 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -23,14 +23,20 @@ from fixtures.neon_fixtures import ( PgBin, S3Scrubber, last_flush_lsn_upload, - wait_for_last_flush_lsn, ) -from 
fixtures.pageserver.utils import list_prefix +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + assert_tenant_state, + list_prefix, + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.remote_storage import ( RemoteStorageKind, ) from fixtures.types import TenantId, TimelineId from fixtures.utils import print_gc_result, wait_until +from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -93,7 +99,10 @@ def generate_uploads_and_deletions( ) assert tenant_id is not None assert timeline_id is not None - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers @@ -178,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - After upgrade, the bucket should contain a mixture. - In both cases, postgres I/O should work. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -187,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - assert env.attachment_service is not None env.attachment_service.start() env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) @@ -253,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None some_other_pageserver = 1234 ps_http = env.pageserver.http_client() @@ -332,7 +337,6 @@ def test_deletion_queue_recovery( :param validate_before: whether to wait for deletions to be validated before restart. This makes them elegible to be executed after restart, if the same node keeps the attachment. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -396,7 +400,6 @@ def test_deletion_queue_recovery( if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) env.pageserver.start() @@ -444,7 +447,6 @@ def test_deletion_queue_recovery( def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -464,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - assert env.attachment_service is not None env.attachment_service.stop() # Remember how many validations had happened before the control plane went offline @@ -536,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): and must be constructed using the proper generation for the layer, which may not be the same generation that the tenant is running in. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -560,3 +560,90 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) read_all(env, tenant_id, timeline_id) + + +def test_multi_attach( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + pageservers = env.pageservers + http_clients = list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will intentionally create situations where stale deletions happen from non-latest-generation + # nodes when the tenant is multiply-attached + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[1].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[2].timeline_detail(tenant_id, timeline_id) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(pageservers[0].id) + workload.write_rows(1000, pageservers[0].id) + + # Attach the tenant to the other two pageservers + pageservers[1].tenant_attach(env.initial_tenant) + pageservers[2].tenant_attach(env.initial_tenant) + + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(10, 0.2, lambda: 
assert_tenant_state(http_clients[2], tenant_id, "Active")) + + # Now they all have it attached + _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # The endpoint can use any pageserver to service its reads + for pageserver in pageservers: + workload.validate(pageserver.id) + + # If we write some more data, all the nodes can see it, including stale ones + wrote_lsn = workload.write_rows(1000, pageservers[0].id) + for ps_http in http_clients: + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn) + + # ...and indeed endpoints can see it via any of the pageservers + for pageserver in pageservers: + workload.validate(pageserver.id) + + # Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage + for ps_http in http_clients: + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn) + + # Now, the contents of remote storage will be a set of layers from each pageserver, but with unique + # generation numbers + # TODO: validate remote storage contents + + # Stop all pageservers + for ps in pageservers: + ps.stop() + + # Returning to a normal healthy state: all pageservers will start, but only the one most + # recently attached via the control plane will re-attach on startup + for ps in pageservers: + ps.start() + + with pytest.raises(PageserverApiException): + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # All data we wrote while multi-attached remains readable + workload.validate(pageservers[2].id) diff --git a/test_runner/regress/test_pageserver_metric_collection.py 
b/test_runner/regress/test_pageserver_metric_collection.py index b76dbbee03..042961baa5 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -64,13 +64,13 @@ def test_metric_collection( # spin up neon, after http server is ready env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant @@ -212,13 +212,13 @@ def test_metric_collection_cleans_up_tempfile( pageserver_http = env.pageserver.http_client() # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic 
size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 443b0812fd..c4499196b5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -9,9 +9,7 @@ from fixtures.utils import wait_until # Test restarting page server, while safekeeper and compute node keep # running. -@pytest.mark.parametrize("generations", [True, False]) -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): - neon_env_builder.enable_generations = generations +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() @@ -106,7 +104,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool) # Initial tenant load should reflect the delay we injected ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p), # Subsequent steps should occur in expected order - ("initial_logical_sizes", lambda t, p: t > 0 and t >= p), ("background_jobs_can_start", lambda t, p: t > 0 and t >= p), ("complete", lambda t, p: t > 0 and t >= p), ] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py new file mode 100644 index 0000000000..8ae4297983 --- /dev/null +++ b/test_runner/regress/test_pageserver_secondary.py @@ -0,0 +1,371 @@ +import random +from typing 
import Any, Dict, Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until +from fixtures.workload import Workload + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def evict_random_layers( + rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId +): + """ + Evict 50% of the layers on a pageserver + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + initial_local_layers = sorted( + list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + ) + client = pageserver.http_client() + for layer in initial_local_layers: + if "ephemeral" in layer.name or "temp_download" in layer.name: + continue + + if rng.choice([True, False]): + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") + client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + + +@pytest.mark.parametrize("seed", [1, 2, 3]) +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): + """ + Issue many location configuration changes, ensure that tenants + remain readable & we don't get any unexpected 
errors. We should + have no ERROR in the log, and no 500s in the API. + + The location_config API is intentionally designed so that all destination + states are valid, so that we may test it in this way: the API should always + work as long as the tenant exists. + """ + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + pageservers = env.pageservers + list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will make no effort to avoid stale attachments + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + ".*Dropped remote consistent LSN updates.*", + ".*Dropping stale deletions.*", + # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found + ".*query handler.*Tenant.*not found.*", + # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active + ".*query handler.*Tenant.*not active.*", + ] + ) + + # these can happen, if we shutdown at a good time. to be fixed as part of #5172. + message = ".*duplicated L1 layer layer=.*" + ps.allowed_errors.append(message) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # We use a fixed seed to make the test reproducible: we want a randomly + # chosen order, but not to change the order every time we run the test. 
+ rng = random.Random(seed) + + initial_generation = 1 + last_state = { + env.pageservers[0].id: ("AttachedSingle", initial_generation), + env.pageservers[1].id: ("Detached", None), + env.pageservers[2].id: ("Detached", None), + } + + latest_attached = env.pageservers[0].id + + for _i in range(0, 64): + # Pick a pageserver + pageserver = rng.choice(env.pageservers) + + # Pick a pseudorandom state + modes = [ + "AttachedSingle", + "AttachedMulti", + "AttachedStale", + "Secondary", + "Detached", + "_Evictions", + "_Restart", + ] + + mode = rng.choice(modes) + + last_state_ps = last_state[pageserver.id] + if mode == "_Evictions": + if last_state_ps[0].startswith("Attached"): + log.info(f"Action: evictions on pageserver {pageserver.id}") + evict_random_layers(rng, pageserver, tenant_id, timeline_id) + else: + log.info( + f"Action: skipping evictions on pageserver {pageserver.id}, is not attached" + ) + elif mode == "_Restart": + log.info(f"Action: restarting pageserver {pageserver.id}") + pageserver.stop() + pageserver.start() + if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id: + log.info("Entering postgres...") + workload.churn_rows(rng.randint(128, 256), pageserver.id) + workload.validate(pageserver.id) + elif last_state_ps[0].startswith("Attached"): + # The `attachment_service` will only re-attach on startup when a pageserver was the + # holder of the latest generation: otherwise the pageserver will revert to detached + # state if it was running attached with a stale generation + last_state[pageserver.id] = ("Detached", None) + else: + secondary_conf: Optional[Dict[str, Any]] = None + if mode == "Secondary": + secondary_conf = {"warm": rng.choice([True, False])} + + location_conf: Dict[str, Any] = { + "mode": mode, + "secondary_conf": secondary_conf, + "tenant_conf": {}, + } + + log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}") + + # Select a generation number + if mode.startswith("Attached"): + if 
last_state_ps[1] is not None: + if rng.choice([True, False]): + # Move between attached states, staying in the same generation + generation = last_state_ps[1] + else: + # Switch generations, while also jumping between attached states + generation = env.attachment_service.attach_hook_issue( + tenant_id, pageserver.id + ) + latest_attached = pageserver.id + else: + generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + latest_attached = pageserver.id + else: + generation = None + + location_conf["generation"] = generation + + pageserver.tenant_location_configure(tenant_id, location_conf) + last_state[pageserver.id] = (mode, generation) + + if mode.startswith("Attached"): + # This is a basic test: we are validating that the endpoint works properly _between_ + # configuration changes. A stronger test would be to validate that clients see + # no errors while we are making the changes. + workload.churn_rows( + rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale" + ) + workload.validate(pageserver.id) + + # Attach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all are readable + for ps in env.pageservers: + workload.validate(ps.id) + + # Detach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all local disk state was removed on detach + # TODO + + +def test_live_migration(neon_env_builder: NeonEnvBuilder): + """ + Test the sequence of location states that are used in a live migration.
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver_a = env.pageservers[0] + pageserver_b = env.pageservers[1] + + initial_generation = 1 + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # Make the destination a secondary location + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_a.id, upload=False) + + # Set origin attachment to stale + log.info("Setting origin to AttachedStale") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": initial_generation, + }, + flush_ms=5000, + ) + + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + log.info(f"Acquired generation {migrated_generation} for destination pageserver") + assert migrated_generation == initial_generation + 1 + + # Writes and reads still work in AttachedStale. 
+ workload.validate(pageserver_a.id) + + # TODO: call into secondary mode API hooks to do an upload/download sync + + # Generate some more dirty writes: we expect the origin to ingest WAL + # in AttachedStale + workload.churn_rows(64, pageserver_a.id, upload=False) + workload.validate(pageserver_a.id) + + # Attach the destination + log.info("Setting destination to AttachedMulti") + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + # Wait for destination LSN to catch up with origin + origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + + def caught_up(): + destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + log.info( + f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}" + ) + assert destination_lsn >= origin_lsn + + wait_until(100, 0.1, caught_up) + + # The destination should accept writes + workload.churn_rows(64, pageserver_b.id) + + # Dual attached: both are readable. + workload.validate(pageserver_a.id) + workload.validate(pageserver_b.id) + + # Revert the origin to secondary + log.info("Setting origin to Secondary") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + + # Put the destination into final state + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + workload.validate(pageserver_b.id) + + +def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): + """ + Test that heatmap uploads list a tenant's layers and reflect newly written layers.
+ """ + env = neon_env_builder.init_start() # initial_tenant_conf=TENANT_CONF) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Write some data so that we have some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + + # Write some layers and upload a heatmap + workload.write_rows(256, env.pageservers[0].id) + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + def validate_heatmap(heatmap): + assert len(heatmap["timelines"]) == 1 + assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) + assert len(heatmap["timelines"][0]["layers"]) > 0 + layers = heatmap["timelines"][0]["layers"] + + # Each layer appears at most once + assert len(set(layer["name"] for layer in layers)) == len(layers) + + # Download and inspect the heatmap that the pageserver uploaded + heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.info(f"Read back heatmap: {heatmap_first}") + validate_heatmap(heatmap_first) + + # Do some more I/O to generate more layers + workload.churn_rows(64, env.pageservers[0].id) + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that another heatmap upload includes the new layers + heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.info(f"Read back heatmap: {heatmap_second}") + assert heatmap_second != heatmap_first + validate_heatmap(heatmap_second) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py new file mode 100644 index 0000000000..034f2b669d --- /dev/null +++ b/test_runner/regress/test_physical_replication.py @@ -0,0 +1,29 @@ +import random +import time + +from fixtures.neon_fixtures import NeonEnv + + +def test_physical_replication(neon_simple_env: NeonEnv): + env = neon_simple_env + n_records = 100000 + with env.endpoints.create_start( + 
branch_name="main", + endpoint_id="primary", + ) as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute( + "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" + ) + time.sleep(1) + with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + s_cur.execute( + "select * from t where pk=%s", (random.randrange(1, n_records),) + ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index ccf28cae8c..2fda56d0f4 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -23,7 +23,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import ( - TIMELINE_INDEX_PART_FILE_NAME, LocalFsStorage, RemoteStorageKind, available_remote_storages, @@ -61,8 +60,6 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - neon_env_builder.enable_generations = generations - # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. @@ -74,19 +71,20 @@ def test_remote_storage_backup_and_restore( ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Is this expected? 
+ ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ".*Failed to get local tenant state.*", + # FIXME retry downloads without throwing errors + ".*failed to load remote timeline.*", + # we have a bunch of pytest.raises for these below + ".*tenant .*? already exists, state:.*", + ".*tenant directory already exists.*", + ".*simulated failure of remote operation.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - - env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*") - # FIXME retry downloads without throwing errors - env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") - # we have a bunch of pytest.raises for these below - env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -350,6 +348,13 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() + # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before + # we later increment when actually attaching it again, leading to skipping a generation and potentially getting + # these warnings if there was a durable but un-executed deletion list at time of restart. 
+ env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -603,7 +608,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert isinstance(env.pageserver_remote_storage, LocalFsStorage) remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) - assert not list(remote_timeline_path.iterdir()) + filtered = [ + path + for path in remote_timeline_path.iterdir() + if not (path.name.endswith("initdb.tar.zst")) + ] + assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone assert get_queued_count(file_kind="index", op_kind="upload") is None @@ -643,7 +653,7 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder): ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}" client.tenant_detach(env.initial_tenant) - client.tenant_attach(env.initial_tenant) + env.pageserver.tenant_attach(env.initial_tenant) wait_until_tenant_state(client, env.initial_tenant, "Active", 5) timelines_after_detach = set( @@ -753,10 +763,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # this is because creating a timeline always awaits for the uploads to complete assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id) - assert ( - new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME + assert env.pageserver_remote_storage.index_path( + env.initial_tenant, new_branch_timeline_id ).is_file(), "uploads scheduled during initial load should had been awaited for" finally: + barrier.abort() create_thread.join() @@ -835,7 +846,7 @@ def test_compaction_waits_for_upload( ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" def layer_deletes_completed(): - m = 
client.get_metric_value("pageserver_layer_gcs_count_total", {"state": "completed"}) + m = client.get_metric_value("pageserver_layer_completed_deletes_total") if m is None: return 0 return int(m) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index fcc3243e81..2ed22cabc4 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -314,7 +314,11 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - http_client.tenant_attach(tenant_id) + # The re-attach's increment of the generation number may invalidate deletion queue + # updates in flight from the previous attachment. + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, interval=1, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 167c8355cc..fece876459 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -23,23 +23,18 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, wait_until_tenant_state, ) -from fixtures.remote_storage import ( - RemoteStorageKind, - available_remote_storages, - available_s3_storages, -) +from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage from fixtures.types import TenantId from fixtures.utils import run_pg_bench_small, wait_until -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_tenant_delete_smoke( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): neon_env_builder.pageserver_config_override = "test_remote_failures=1" + remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() 
@@ -78,16 +73,15 @@ def test_tenant_delete_smoke( run_pg_bench_small(pg_bin, endpoint.connstr()) wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) parent = timeline @@ -100,16 +94,15 @@ def test_tenant_delete_smoke( tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() - if remote_storage_kind in available_s3_storages(): - assert_prefix_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) # Deletion updates the tenant count: the one default tenant remains assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 @@ -149,9 +142,7 @@ FAILPOINTS_BEFORE_BACKGROUND = [ def combinations(): result = [] - remotes = [RemoteStorageKind.MOCK_S3] - if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): - remotes.append(RemoteStorageKind.REAL_S3) + remotes = available_s3_storages() for remote_storage_kind in remotes: for delete_failpoint in FAILPOINTS: @@ -165,8 +156,8 @@ def combinations(): return result -@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) @pytest.mark.parametrize("check", list(Check)) +@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) def test_delete_tenant_exercise_crash_safety_failpoints( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -214,16 +205,15 @@ def test_delete_tenant_exercise_crash_safety_failpoints( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - if remote_storage_kind in 
available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) ps_http.configure_failpoints((failpoint, "return")) @@ -276,24 +266,23 @@ def test_delete_tenant_exercise_crash_safety_failpoints( assert not tenant_dir.exists() # Check remote is empty - if remote_storage_kind in available_s3_storages(): - assert_prefix_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + allowed_postfix="initdb.tar.zst", + ) -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_tenant_delete_is_resumed_on_attach( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): + remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -313,16 +302,15 @@ def test_tenant_delete_is_resumed_on_attach( wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) # sanity check, data should be there - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) # failpoint before we remove index_part from s3 failpoint = "timeline-delete-before-index-delete" @@ -353,16 +341,15 @@ def test_tenant_delete_is_resumed_on_attach( iterations=iterations, ) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + 
assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) reason = tenant_info["state"]["data"]["reason"] # failpoint may not be the only error in the stack @@ -379,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach( env.pageserver.start() # now we call attach - ps_http.tenant_attach(tenant_id=tenant_id) + env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed wait_tenant_status_404(ps_http, tenant_id, iterations) @@ -388,17 +375,16 @@ def test_tenant_delete_is_resumed_on_attach( tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() - if remote_storage_kind in available_s3_storages(): - ps_http.deletion_queue_flush(execute=True) - assert_prefix_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + ps_http.deletion_queue_flush(execute=True) + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): @@ -409,16 +395,16 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE env.start() pageserver_http = env.pageserver.http_client() - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - env.pageserver.allowed_errors.append( - ".*Timeline got dropped without initializing, cleaning its files" - ) - # the response hit_pausable_failpoint_and_later_fail - env.pageserver.allowed_errors.append( - f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn" + env.pageserver.allowed_errors.extend( + [ + # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero + ".*Timeline got dropped without initializing, cleaning its files", + # the response hit_pausable_failpoint_and_later_fail + 
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", + ] ) - pageserver_http.tenant_create(env.initial_tenant) + env.pageserver.tenant_create(env.initial_tenant) failpoint = "flush-layer-cancel-after-writing-layer-out-pausable" pageserver_http.configure_failpoints((failpoint, "pause")) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0bd3800480..0dcbb23ad4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,4 +1,5 @@ import asyncio +import enum import random import time from threading import Thread @@ -20,7 +21,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import ( RemoteStorageKind, - available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -51,14 +51,18 @@ def do_gc_target( log.info("gc http thread returning") -# Basic detach and re-attach test -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_tenant_reattach( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, -): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +class ReattachMode(str, enum.Enum): + REATTACH_EXPLICIT = "explicit" + REATTACH_RESET = "reset" + REATTACH_RESET_DROP = "reset_drop" + +# Basic detach and re-attach test +@pytest.mark.parametrize( + "mode", + [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP], +) +def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. 
@@ -72,6 +76,10 @@ def test_tenant_reattach( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) + # Our re-attach may race with the deletion queue processing LSN updates + # from the original attachment. + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -100,8 +108,15 @@ def test_tenant_reattach( ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value ) - pageserver_http.tenant_detach(tenant_id) - pageserver_http.tenant_attach(tenant_id) + if mode == ReattachMode.REATTACH_EXPLICIT: + # Explicitly detach then attach the tenant as two separate API calls + env.pageserver.tenant_detach(tenant_id) + env.pageserver.tenant_attach(tenant_id) + elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP): + # Use the reset API to detach/attach in one shot + pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP) + else: + raise NotImplementedError(mode) time.sleep(1) # for metrics propagation @@ -166,15 +181,15 @@ num_rows = 100000 # # I don't know what's causing that... @pytest.mark.skip(reason="fixme") -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_tenant_reattach_while_busy( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, ): updates_started = 0 updates_finished = 0 updates_to_perform = 0 + env = neon_env_builder.init_start() + # Run random UPDATEs on test table. On failure, try again. 
async def update_table(pg_conn: asyncpg.Connection): nonlocal updates_started, updates_finished, updates_to_perform @@ -206,7 +221,7 @@ def test_tenant_reattach_while_busy( pageserver_http.tenant_detach(tenant_id) await asyncio.sleep(1) log.info("Re-attaching tenant") - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) log.info("Re-attach finished") # Continue with 5000 more updates @@ -227,9 +242,6 @@ def test_tenant_reattach_while_busy( assert updates_finished == updates_to_perform - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() # create new nenant @@ -295,10 +307,14 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too - env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # Timelines get stopped during detach, ignore the gc calls that error, witnessing that - env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") + env.pageserver.allowed_errors.extend( + [ + # the error will be printed to the log too + ".*gc target timeline does not exist.*", + # Timelines get stopped during detach, ignore the gc calls that error, witnessing that + ".*InternalServerError\\(timeline is Stopping.*", + ] + ) # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. 
@@ -418,13 +434,9 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): should not be present in pageserver's memory" -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -437,6 +449,10 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) + # Our re-attach may race with the deletion queue processing LSN updates + # from the original attachment. + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -470,7 +486,7 @@ def test_detach_while_attaching( # And re-attach pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) # Before it has chance to finish, detach it again pageserver_http.tenant_detach(tenant_id) @@ -480,7 +496,7 @@ def test_detach_while_attaching( # Attach it again. If the GC and compaction loops from the previous attach/detach # cycle are still running, things could get really confusing.. 
- pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) with endpoint.cursor() as cur: cur.execute("SELECT COUNT(*) FROM foo") @@ -539,7 +555,7 @@ def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder): ), "Ignored tenant should not be reloaded after pageserver restart" # now, load it from the local files and expect it works - pageserver_http.tenant_load(tenant_id=ignored_tenant_id) + env.pageserver.tenant_load(tenant_id=ignored_tenant_id) wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] @@ -594,7 +610,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder assert layers_removed, f"Found no layers for tenant {timeline_dir}" # now, load it from the local files and expect it to work due to remote storage restoration - pageserver_http.tenant_load(tenant_id=tenant_id) + env.pageserver.tenant_load(tenant_id=tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] @@ -628,13 +644,13 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): expected_exception=PageserverApiException, match=f"tenant {tenant_id} already exists, state: Active", ): - pageserver_http.tenant_load(tenant_id) + env.pageserver.tenant_load(tenant_id) with pytest.raises( expected_exception=PageserverApiException, match=f"tenant {tenant_id} already exists, state: Active", ): - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) pageserver_http.tenant_ignore(tenant_id) @@ -643,7 +659,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): expected_exception=PageserverApiException, match="tenant directory already exists", ): - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) def test_ignore_while_attaching( @@ -662,6 +678,10 @@ def 
test_ignore_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) + # Our re-attach may race with the deletion queue processing LSN updates + # from the original attachment. + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -672,7 +692,7 @@ def test_ignore_while_attaching( pageserver_http.tenant_detach(tenant_id) # And re-attach, but stop attach task_mgr task from completing pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) # Run ignore on the task, thereby cancelling the attach. # XXX This should take priority over attach, i.e., it should cancel the attach task. # But neither the failpoint, nor the proper remote_timeline_client download functions, @@ -687,7 +707,7 @@ def test_ignore_while_attaching( expected_exception=PageserverApiException, match="tenant directory already exists", ): - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" @@ -697,7 +717,7 @@ def test_ignore_while_attaching( # Calling load will bring the tenant back online pageserver_http.configure_failpoints([("attach-before-activate", "off")]) - pageserver_http.tenant_load(tenant_id) + env.pageserver.tenant_load(tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) @@ -801,7 +821,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( found_broken ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" - client.tenant_load(env.initial_tenant) + env.pageserver.tenant_load(env.initial_tenant) 
found_active = False active, broken_set = ([], []) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 8be0f0449b..dcd7232b1b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,13 +7,8 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest -from fixtures.broker import NeonBroker from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - Endpoint, - NeonEnv, - NeonEnvBuilder, -) +from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, @@ -25,12 +20,10 @@ from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, - available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - start_in_background, subprocess_capture, wait_until, ) @@ -40,58 +33,6 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): assert abs(a - b) / a < margin_ratio, abs(a - b) / a -@contextmanager -def new_pageserver_service( - new_pageserver_dir: Path, - pageserver_bin: Path, - remote_storage_mock_path: Path, - pg_port: int, - http_port: int, - broker: Optional[NeonBroker], - pg_distrib_dir: Path, -): - """ - cannot use NeonPageserver yet because it depends on neon cli - which currently lacks support for multiple pageservers - """ - # actually run new pageserver - cmd = [ - str(pageserver_bin), - "--workdir", - str(new_pageserver_dir), - "--update-config", - f"-c listen_pg_addr='localhost:{pg_port}'", - f"-c listen_http_addr='localhost:{http_port}'", - f"-c pg_distrib_dir='{pg_distrib_dir}'", - "-c id=2", - f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - ] - if broker is not None: - cmd.append( - f"-c 
broker_endpoint='{broker.client_url()}'", - ) - pageserver_client = PageserverHttpClient( - port=http_port, - auth_token=None, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled - ) - try: - pageserver_process = start_in_background( - cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status - ) - except Exception as e: - log.error(e) - pageserver_process.kill() - raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e - - log.info("new pageserver started") - try: - yield pageserver_process - finally: - log.info("stopping new pageserver") - pageserver_process.kill() - - @contextmanager def pg_cur(endpoint): with closing(endpoint.connect()) as conn: @@ -201,7 +142,7 @@ def check_timeline_attached( def switch_pg_to_new_pageserver( - env: NeonEnv, + origin_ps: NeonPageserver, endpoint: Endpoint, new_pageserver_port: int, tenant_id: TenantId, @@ -216,7 +157,7 @@ def switch_pg_to_new_pageserver( endpoint.start() - timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id) + timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach @@ -269,27 +210,33 @@ def test_tenant_relocation( with_load: str, ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start() tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageservers[0].allowed_errors.extend( + [ + # FIXME: Is this expected? 
+ ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + # Needed for detach polling on the original pageserver + f".*NotFound: tenant {tenant_id}.*", + # We will dual-attach in this test, so stale generations are expected + ".*Dropped remote consistent LSN updates.*", + ] ) - # Needed for detach polling. - env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - remote_storage_mock_path = env.pageserver_remote_storage.root # we use two branches to check that they are both relocated # first branch is used for load, compute for second one is used to # check that data is not lost - pageserver_http = env.pageserver.http_client() + origin_ps = env.pageservers[0] + destination_ps = env.pageservers[1] + origin_http = origin_ps.http_client() + destination_http = destination_ps.http_client() _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) @@ -302,7 +249,7 @@ def test_tenant_relocation( timeline_id_main, current_lsn_main = populate_branch( ep_main, tenant_id=tenant_id, - ps_http=pageserver_http, + ps_http=origin_http, create_table=True, expected_sum=500500, ) @@ -320,17 +267,17 @@ def test_tenant_relocation( timeline_id_second, current_lsn_second = populate_branch( ep_second, tenant_id=tenant_id, - ps_http=pageserver_http, + ps_http=origin_http, create_table=False, expected_sum=1001000, ) # wait until pageserver receives that data - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) - timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main) + wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_main, current_lsn_main) + timeline_detail_main = origin_http.timeline_detail(tenant_id, timeline_id_main) - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, 
current_lsn_second) - timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second) + wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_second, current_lsn_second) + timeline_detail_second = origin_http.timeline_detail(tenant_id, timeline_id_second) if with_load == "with_load": # create load table @@ -350,170 +297,149 @@ def test_tenant_relocation( # if user creates a branch during migration # it wont appear on the new pageserver ensure_checkpoint( - pageserver_http=pageserver_http, + pageserver_http=origin_http, tenant_id=tenant_id, timeline_id=timeline_id_main, current_lsn=current_lsn_main, ) ensure_checkpoint( - pageserver_http=pageserver_http, + pageserver_http=origin_http, tenant_id=tenant_id, timeline_id=timeline_id_second, current_lsn=current_lsn_second, ) - log.info("inititalizing new pageserver") - # bootstrap second pageserver - new_pageserver_dir = env.repo_dir / "new_pageserver" - new_pageserver_dir.mkdir() + # Migrate either by attaching from s3 or import/export basebackup + if method == "major": + cmd = [ + "poetry", + "run", + "python", + str(base_dir / "scripts/export_import_between_pageservers.py"), + "--tenant-id", + str(tenant_id), + "--from-host", + "localhost", + "--from-http-port", + str(origin_http.port), + "--from-pg-port", + str(origin_ps.service_port.pg), + "--to-host", + "localhost", + "--to-http-port", + str(destination_http.port), + "--to-pg-port", + str(destination_ps.service_port.pg), + "--pg-distrib-dir", + str(neon_env_builder.pg_distrib_dir), + "--work-dir", + str(test_output_dir), + "--tmp-pg-port", + str(port_distributor.get_port()), + ] + subprocess_capture(test_output_dir, cmd, check=True) - new_pageserver_pg_port = port_distributor.get_port() - new_pageserver_http_port = port_distributor.get_port() - log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = neon_binpath / "pageserver" + destination_ps.allowed_errors.append( + 
".*ignored .* unexpected bytes after the tar archive.*" + ) + elif method == "minor": + # call to attach timeline to new pageserver + destination_ps.tenant_attach(tenant_id) - new_pageserver_http = PageserverHttpClient( - port=new_pageserver_http_port, - auth_token=None, - is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip, - ) + # wait for tenant to finish attaching + wait_until( + number_of_iterations=10, + interval=1, + func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"), + ) - with new_pageserver_service( - new_pageserver_dir, - pageserver_bin, - remote_storage_mock_path, - new_pageserver_pg_port, - new_pageserver_http_port, - neon_env_builder.broker, - neon_env_builder.pg_distrib_dir, - ): - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(pageserver_http.port), - "--from-pg-port", - str(env.pageserver.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(new_pageserver_http_port), - "--to-pg-port", - str(new_pageserver_pg_port), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - elif method == "minor": - # call to attach timeline to new pageserver - new_pageserver_http.tenant_attach(tenant_id) - - # wait for tenant to finish attaching - wait_until( - number_of_iterations=10, - interval=1, - func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"), - ) - - check_timeline_attached( - new_pageserver_http, - tenant_id, - timeline_id_main, - timeline_detail_main, - current_lsn_main, - ) - - check_timeline_attached( - new_pageserver_http, - tenant_id, - timeline_id_second, - 
timeline_detail_second, - current_lsn_second, - ) - - # rewrite neon cli config to use new pageserver for basebackup to start new compute - lines = (env.repo_dir / "config").read_text().splitlines() - for i, line in enumerate(lines): - if line.startswith("listen_http_addr"): - lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" - if line.startswith("listen_pg_addr"): - lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" - (env.repo_dir / "config").write_text("\n".join(lines)) - - old_local_path_main = switch_pg_to_new_pageserver( - env, - ep_main, - new_pageserver_pg_port, + check_timeline_attached( + destination_http, tenant_id, timeline_id_main, + timeline_detail_main, + current_lsn_main, ) - old_local_path_second = switch_pg_to_new_pageserver( - env, - ep_second, - new_pageserver_pg_port, + check_timeline_attached( + destination_http, tenant_id, timeline_id_second, + timeline_detail_second, + current_lsn_second, ) - # detach tenant from old pageserver before we check - # that all the data is there to be sure that old pageserver - # is no longer involved, and if it is, we will see the error - pageserver_http.tenant_detach(tenant_id) + # rewrite neon cli config to use new pageserver for basebackup to start new compute + lines = (env.repo_dir / "config").read_text().splitlines() + for i, line in enumerate(lines): + if line.startswith("listen_http_addr"): + lines[i] = f"listen_http_addr = 'localhost:{destination_http.port}'" + if line.startswith("listen_pg_addr"): + lines[i] = f"listen_pg_addr = 'localhost:{destination_ps.service_port.pg}'" + (env.repo_dir / "config").write_text("\n".join(lines)) - # Wait a little, so that the detach operation has time to finish. 
- wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1) + old_local_path_main = switch_pg_to_new_pageserver( + origin_ps, + ep_main, + destination_ps.service_port.pg, + tenant_id, + timeline_id_main, + ) - post_migration_check(ep_main, 500500, old_local_path_main) - post_migration_check(ep_second, 1001000, old_local_path_second) + old_local_path_second = switch_pg_to_new_pageserver( + origin_ps, + ep_second, + destination_ps.service_port.pg, + tenant_id, + timeline_id_second, + ) - # ensure that we can successfully read all relations on the new pageserver - with pg_cur(ep_second) as cur: - cur.execute( - """ - DO $$ - DECLARE - r RECORD; - BEGIN - FOR r IN - SELECT relname FROM pg_class WHERE relkind='r' - LOOP - RAISE NOTICE '%', r.relname; - EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; - END LOOP; - END$$; - """ - ) + # detach tenant from old pageserver before we check + # that all the data is there to be sure that old pageserver + # is no longer involved, and if it is, we will see the error + origin_http.tenant_detach(tenant_id) - if with_load == "with_load": - assert load_ok_event.wait(3) - log.info("stopping load thread") - load_stop_event.set() - load_thread.join(timeout=10) - log.info("load thread stopped") + # Wait a little, so that the detach operation has time to finish. 
+ wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1) - # bring old pageserver back for clean shutdown via neon cli - # new pageserver will be shut down by the context manager - lines = (env.repo_dir / "config").read_text().splitlines() - for i, line in enumerate(lines): - if line.startswith("listen_http_addr"): - lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" - if line.startswith("listen_pg_addr"): - lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" - (env.repo_dir / "config").write_text("\n".join(lines)) + post_migration_check(ep_main, 500500, old_local_path_main) + post_migration_check(ep_second, 1001000, old_local_path_second) + + # ensure that we can successfully read all relations on the new pageserver + with pg_cur(ep_second) as cur: + cur.execute( + """ + DO $$ + DECLARE + r RECORD; + BEGIN + FOR r IN + SELECT relname FROM pg_class WHERE relkind='r' + LOOP + RAISE NOTICE '%', r.relname; + EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; + END LOOP; + END$$; + """ + ) + + if with_load == "with_load": + assert load_ok_event.wait(3) + log.info("stopping load thread") + load_stop_event.set() + load_thread.join(timeout=10) + log.info("load thread stopped") + + # bring old pageserver back for clean shutdown via neon cli + # new pageserver will be shut down by the context manager + lines = (env.repo_dir / "config").read_text().splitlines() + for i, line in enumerate(lines): + if line.startswith("listen_http_addr"): + lines[i] = f"listen_http_addr = 'localhost:{origin_ps.service_port.http}'" + if line.startswith("listen_pg_addr"): + lines[i] = f"listen_pg_addr = 'localhost:{origin_ps.service_port.pg}'" + (env.repo_dir / "config").write_text("\n".join(lines)) # Simulate hard crash of pageserver and re-attach a tenant with a branch @@ -523,13 +449,9 @@ def test_tenant_relocation( # last-record LSN. 
We had a bug where GetPage incorrectly followed the # timeline to the ancestor without waiting for the missing WAL to # arrive. -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_emergency_relocate_with_branches_slow_replay( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() pageserver_http = env.pageserver.http_client() @@ -571,7 +493,7 @@ def test_emergency_relocate_with_branches_slow_replay( # Attach and wait a few seconds to give it time to load the tenants, attach to the # safekeepers, and to stream and ingest the WAL up to the pause-point. before_attach_time = time.time() - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) time.sleep(3) # The wal ingestion on the main timeline should now be paused at the fail point. @@ -677,13 +599,9 @@ def test_emergency_relocate_with_branches_slow_replay( # exist. Update dbir" path (2), and inserts an entry in the # DbDirectory with 'false' to indicate there is no PG_VERSION file. # -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_emergency_relocate_with_branches_createdb( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -718,7 +636,7 @@ def test_emergency_relocate_with_branches_createdb( # ingest the WAL, but let's make this less dependent on accidental timing. 
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")]) before_attach_time = time.time() - pageserver_http.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) child_endpoint.start() with child_endpoint.cursor(dbname="neondb") as cur: diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 090d586721..22036884ee 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -18,7 +18,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import timeline_delete_wait_completed -from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -281,19 +281,15 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde assert post_detach_samples == set() -# Check that empty tenants work with or without the remote storage -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_pageserver_with_empty_tenants( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - +def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ".*load failed.*list timelines directory.*", + ] ) - env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*") client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py 
b/test_runner/regress/test_tenants_with_remote_storage.py index 0169335a70..07fb6dc5ca 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,7 +11,6 @@ import os from pathlib import Path from typing import List, Tuple -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -27,7 +26,6 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, - available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -60,10 +58,7 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): await asyncio.gather(*workers) -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - +def test_tenants_many(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -122,10 +117,12 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Are these expected? 
+ ".*No timelines to attach received.*", + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) pageserver_http = env.pageserver.http_client() @@ -218,22 +215,19 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): def test_tenant_redownloads_truncated_file_on_startup( neon_env_builder: NeonEnvBuilder, ): - remote_storage_kind = RemoteStorageKind.LOCAL_FS - - # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - + # we store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. env = neon_env_builder.init_start() assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - env.pageserver.allowed_errors.append(".*removing local file .* because .*") - - # FIXME: Are these expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + ".*removing local file .* because .*", + # FIXME: Are these expected? 
+ ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -297,8 +291,8 @@ def test_tenant_redownloads_truncated_file_on_startup( assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated - remote_layer_path = ( - env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / path.name + remote_layer_path = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, path.name ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 27d5cce5f2..5f72cfd747 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -36,12 +36,13 @@ def test_threshold_based_eviction( ".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*" ) env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(metrics_refused_log_line) - - # these can happen whenever we run consumption metrics collection - env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled") - env.pageserver.allowed_errors.append( - r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + metrics_refused_log_line, + # these can happen whenever we run consumption metrics collection + r".*failed to calculate logical size at \S+: cancelled", + r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes", + ] ) tenant_id, timeline_id = env.initial_tenant, 
env.initial_timeline diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 2e1fcd38fe..82ffcb1177 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -29,8 +29,7 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, - available_remote_storages, - available_s3_storages, + s3_storage, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until @@ -40,10 +39,14 @@ from urllib3.util.retry import Retry def test_timeline_delete(neon_simple_env: NeonEnv): env = neon_simple_env - env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") - env.pageserver.allowed_errors.append(".*timeline not found.*") - env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*") + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found.*", + ".*timeline not found.*", + ".*Cannot delete timeline which has child timelines.*", + ".*Precondition failed: Requested tenant is missing.*", + ] + ) ps_http = env.pageserver.http_client() @@ -142,25 +145,11 @@ DELETE_FAILPOINTS = [ ] -def combinations(): - result = [] - - remotes = [RemoteStorageKind.MOCK_S3] - if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): - remotes.append(RemoteStorageKind.REAL_S3) - - for remote_storage_kind in remotes: - for delete_failpoint in DELETE_FAILPOINTS: - result.append((remote_storage_kind, delete_failpoint)) - return result - - # cover the two cases: remote storage configured vs not configured -@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("failpoint", DELETE_FAILPOINTS) @pytest.mark.parametrize("check", list(Check)) def test_delete_timeline_exercise_crash_safety_failpoints( 
neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, failpoint: str, check: Check, pg_bin: PgBin, @@ -180,7 +169,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 7. Ensure failpoint is hit 8. Retry or restart without the failpoint and check the result. """ - + remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( @@ -201,35 +190,34 @@ def test_delete_timeline_exercise_crash_safety_failpoints( last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(env.initial_tenant), - "timelines", - str(timeline_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) - env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") - # It appears when we stopped flush loop during deletion and then pageserver is stopped - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + f".*{timeline_id}.*failpoint: {failpoint}", + # It appears when we stopped flush loop during deletion and then pageserver is stopped + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # This happens when we fail before scheduling background operation. + # Timeline is left in stopping state and retry tries to stop it again. 
+ ".*Ignoring new state, equal to the existing one: Stopping", + # This happens when we retry delete requests for broken timelines + ".*Ignoring state update Stopping for broken timeline", + # This happens when timeline remains are cleaned up during loading + ".*Timeline dir entry become invalid.*", + # In one of the branches we poll for tenant to become active. Polls can generate this log message: + f".*Tenant {env.initial_tenant} is not active*", + ] ) - # This happens when we fail before scheduling background operation. - # Timeline is left in stopping state and retry tries to stop it again. - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # This happens when we retry delete requests for broken timelines - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") - # This happens when timeline remains are cleaned up during loading - env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") - # In one of the branches we poll for tenant to become active. 
Polls can generate this log message: - env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") ps_http.configure_failpoints((failpoint, "return")) @@ -275,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http, env.initial_tenant, timeline_id, iterations=iterations ) - if failpoint == "timeline-delete-after-index-delete": - m = ps_http.get_metrics() - assert ( - m.query_one( - "remote_storage_s3_request_seconds_count", - filter={"request_type": "get_object", "result": "ok"}, - ).value - == 1 # index part for initial timeline - ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state @@ -308,17 +287,17 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ) timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id) + # Check local is empty - assert not timeline_dir.exists() + assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0 + # Check no delete mark present assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists() -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @pytest.mark.parametrize("fill_branch", [True, False]) def test_timeline_resurrection_on_attach( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, fill_branch: bool, ): """ @@ -327,8 +306,6 @@ def test_timeline_resurrection_on_attach( Original issue: https://github.com/neondatabase/neon/issues/3560 """ - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() @@ -394,7 +371,7 @@ def test_timeline_resurrection_on_attach( ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted env.pageserver.start() - ps_http.tenant_attach(tenant_id=tenant_id) + env.pageserver.tenant_attach(tenant_id=tenant_id) 
wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) @@ -416,13 +393,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm") - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # this happens, because the stuck timeline is visible to shutdown - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + ".*failpoint: timeline-delete-before-rm", + ".*Ignoring new state, equal to the existing one: Stopping", + # this happens, because the stuck timeline is visible to shutdown + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ] ) ps_http = env.pageserver.http_client() @@ -569,10 +546,12 @@ def test_concurrent_timeline_delete_stuck_on( with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 409 - env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") - # the second call will try to transition the timeline into Stopping state as well - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + env.pageserver.allowed_errors.extend( + [ + f".*{child_timeline_id}.*{error_msg_re}.*", + # the second call will try to transition the timeline into Stopping state as well + f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping", + ] ) log.info("second call failed as expected") @@ 
-656,20 +635,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2) -@pytest.mark.parametrize( - "remote_storage_kind", - list( - filter( - lambda s: s in (RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3), - available_remote_storages(), - ) - ), -) def test_timeline_delete_works_for_remote_smoke( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start() @@ -802,12 +771,11 @@ def test_delete_orphaned_objects( assert env.pageserver_remote_storage.index_path(env.initial_tenant, timeline_id).exists() -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_timeline_delete_resumed_on_attach( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): + remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -822,18 +790,17 @@ def test_timeline_delete_resumed_on_attach( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(env.initial_tenant), - "timelines", - str(timeline_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) # failpoint before we remove index_part from s3 failpoint = "timeline-delete-during-rm" @@ -871,18 +838,17 @@ def test_timeline_delete_resumed_on_attach( # failpoint may not be the only error in the stack assert 
reason.endswith(f"failpoint: {failpoint}"), reason - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - "timelines", - str(timeline_id), - ) - ), - ) + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + "timelines", + str(timeline_id), + ) + ), + ) # now we stop pageserver and remove local tenant state env.endpoints.stop_all() @@ -895,7 +861,7 @@ def test_timeline_delete_resumed_on_attach( env.pageserver.start() # now we call attach - ps_http.tenant_attach(tenant_id=tenant_id) + env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) @@ -903,15 +869,14 @@ def test_timeline_delete_resumed_on_attach( tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() - if remote_storage_kind in available_s3_storages(): - assert_prefix_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(timeline_id), - "timelines", - str(timeline_id), - ) - ), - ) + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(timeline_id), + "timelines", + str(timeline_id), + ) + ), + ) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index eb98348823..11685d1d48 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,3 +1,4 @@ +import concurrent.futures import math import queue import random @@ -24,6 +25,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, + wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -146,6 +148,72 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim time.sleep(polling_interval) +def 
test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") + + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + + endpoint_main = env.endpoints.create( + "test_timeline_size_quota_on_startup", + # Set small limit for the test + config_lines=["neon.max_cluster_size=30MB"], + ) + endpoint_main.start() + + log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch") + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (t text)") + + # Insert many rows. This query must fail because of space limit + try: + for _i in range(5000): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # If we get here, the timeline size limit failed + log.error("Query unexpectedly succeeded") + raise AssertionError() + + except psycopg2.errors.DiskFull as err: + log.info(f"Query expectedly failed with: {err}") + + # Restart endpoint that reached the limit to ensure that it doesn't fail on startup + # i.e. the size limit is not enforced during startup. 
+ endpoint_main.stop() + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + # which is needed for neon.pg_cluster_size() to work + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + # ensure that the limit is enforced after startup + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + # This query must fail because of space limit + try: + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + # If we get here, the timeline size limit failed + log.error("Query unexpectedly succeeded") + raise AssertionError() + + except psycopg2.errors.DiskFull as err: + log.info(f"Query expectedly failed with: {err}") + + def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -234,7 +302,8 @@ def test_timeline_initial_logical_size_calculation_cancellation( env = neon_env_builder.init_start() client = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -666,3 +735,191 @@ def wait_for_timeline_size_init( raise Exception( f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" ) + + +def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): + """ + Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete + before proceeding. However, they skip this if a client is actively trying to access them. + + This test is not purely about logical sizes, but logical size calculation is the phase that we + use as a proxy for "warming up" in this test: it happens within the semaphore guard used + to limit concurrent tenant warm-up. 
+ """ + + # We will run with the limit set to 1, so that once we have one tenant stuck + # in a pausable failpoint, the rest are prevented from proceeding through warmup. + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Create some tenants + n_tenants = 10 + tenant_ids = {env.initial_tenant} + for _i in range(0, n_tenants - 1): + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + + # Empty tenants are not subject to waiting for logical size calculations, because + # those hapen on timeline level + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline( + new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id + ) + + tenant_ids.add(tenant_id) + + # Restart pageserver with logical size calculations paused + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def get_tenant_states(): + states = {} + log.info(f"Tenant ids: {tenant_ids}") + for tenant_id in tenant_ids: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + states[tenant_id] = tenant["state"]["slug"] + log.info(f"Tenant states: {states}") + return states + + def at_least_one_active(): + assert "Active" in set(get_tenant_states().values()) + + # One tenant should activate, then get stuck in their logical size calculation + wait_until(10, 1, at_least_one_active) + + # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate + time.sleep(5) + + # We should see one tenant win the activation race, and enter logical size calculation. 
The rest + # will stay in Attaching state, waiting for the "warmup_limit" semaphore + expect_activated = 1 + states = get_tenant_states() + assert len([s for s in states.values() if s == "Active"]) == expect_activated + assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + + # This is zero, and subsequent checks are expect_activated - 1, because this counter does not + # count how may tenants are Active, it counts how many have finished warmup. The first tenant + # that reached Active is still stuck in its local size calculation, and has therefore not finished warmup. + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0 + + # If a client accesses one of the blocked tenants, it should skip waiting for warmup and + # go active as fast as it can. + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id) + endpoint.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10) g", + ] + ) + endpoint.stop() + + # That one that we successfully accessed is now Active + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # The ones we didn't touch are still in Attaching + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + # Timeline creation operations also wake up Attaching tenants + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + pageserver_http.timeline_create(env.pg_version, 
stuck_tenant_id, TimelineId.generate()) + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # When we unblock logical size calculation, all tenants should proceed to active state via + # the warmup route. + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + def all_active(): + assert all(s == "Active" for s in get_tenant_states().values()) + + wait_until(10, 1, all_active) + + # Final control check: restarting with no failpoints at all results in all tenants coming active + # without being prompted by client I/O + env.pageserver.stop() + env.pageserver.start() + wait_until(10, 1, all_active) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + + # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # body of the test because it will disrupt tenant counts + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + wait_until(10, 1, at_least_one_active) + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + # Deleting a stuck tenant should prompt it to go active + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def delete_tenant(): + env.pageserver.http_client().tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete 
while calculating + # logical size is paused in a failpoint. So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + + def activated_on_demand(): + assert env.pageserver.log_contains(log_match) is not None + + log.info(f"Waiting for activation message '{log_match}'") + try: + wait_until(10, 1, activated_on_demand) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked + log.info("Joining background delete") + background_delete.result(timeout=10) + + # Poll for deletion to complete + wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 1 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 05c60eb102..4dfc883f4c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,6 +1,5 @@ import filecmp import os -import pathlib import random import shutil import signal @@ -31,6 +30,7 @@ from fixtures.neon_fixtures import ( Safekeeper, SafekeeperHttpClient, SafekeeperPort, + last_flush_lsn_upload, ) from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -39,10 +39,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import ( - RemoteStorageKind, - available_remote_storages, -) +from fixtures.remote_storage import RemoteStorageKind, default_remote_storage from fixtures.types import Lsn, 
TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background @@ -287,29 +284,47 @@ def test_broker(neon_env_builder: NeonEnvBuilder): # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] - log.info(f"statuses is {stat_before}") + log.info(f"statuses before insert: {stat_before}") endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'") - # force checkpoint in pageserver to advance remote_consistent_lsn - wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver) + # wait for remote_consistent_lsn to reach flush_lsn, forcing it with checkpoint + new_rcl = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + log.info(f"new_rcl: {new_rcl}") + endpoint.stop() # and wait till remote_consistent_lsn propagates to all safekeepers + # + # This timeout is long: safekeepers learn about remote_consistent_lsn updates when a pageserver + # connects, receives a PrimaryKeepAlive, and sends a PageserverFeedback. So the timeout has to encompass: + # - pageserver deletion_queue to validate + publish the remote_consistent_lsn + # - pageserver to reconnect to all safekeepers one by one, with multi-second delays between + # + # TODO: timeline status on safekeeper should take into account peers state as well. 
+ rcl_propagate_secs = 60 + started_at = time.time() while True: stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] - if all( - s_after.remote_consistent_lsn > s_before.remote_consistent_lsn - for s_after, s_before in zip(stat_after, stat_before) - ): + if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]): break elapsed = time.time() - started_at - if elapsed > 20: + if elapsed > rcl_propagate_secs: raise RuntimeError( f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}" ) time.sleep(1) + # Ensure that safekeepers don't lose remote_consistent_lsn on restart. + # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep. + time.sleep(6) + for sk in env.safekeepers: + sk.stop() + sk.start() + stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] + log.info(f"statuses after {stat_after_restart}") + assert all([s.remote_consistent_lsn >= new_rcl for s in stat_after_restart]) + # Test that old WAL consumed by peers and pageserver is removed from safekeepers. 
@pytest.mark.parametrize("auth_enabled", [False, True]) @@ -404,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None): try: if f(): break - except Exception: + except Exception as e: + log.info(f"got exception while waiting for {desc}: {e}") pass elapsed = time.time() - started_at if elapsed > timeout: @@ -439,10 +455,9 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, return sk_wal_size_mb <= target_size_mb -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): +def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() @@ -485,11 +500,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot ) -@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -552,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - endpoint.stop_and_destroy() + endpoint.stop() timeline_delete_wait_completed(ps_http, tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -639,7 +653,7 @@ class ProposerPostgres(PgProtocol): def __init__( self, pgdata_dir: str, - pg_bin, + 
pg_bin: PgBin, tenant_id: TenantId, timeline_id: TimelineId, listen_addr: str, @@ -665,7 +679,7 @@ class ProposerPostgres(PgProtocol): def create_dir_config(self, safekeepers: str): """Create dir and config for running --sync-safekeepers""" - pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) + Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", @@ -691,7 +705,7 @@ class ProposerPostgres(PgProtocol): "PGDATA": self.pg_data_dir_path(), } - basepath = self.pg_bin.run_capture(command, env) + basepath = self.pg_bin.run_capture(command, env, with_command_header=False) log.info(f"postgres --sync-safekeepers output: {basepath}") @@ -988,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Context manager which logs passed time on exit. +class DurationLogger: + def __init__(self, desc): + self.desc = desc + + def __enter__(self): + self.ts_before = time.time() + + def __exit__(self, *exc): + log.info(f"{self.desc} finished in {time.time() - self.ts_before}s") + + +# Context manager which logs WAL position change on exit. +class WalChangeLogger: + def __init__(self, ep, desc_before): + self.ep = ep + self.desc_before = desc_before + + def __enter__(self): + self.ts_before = time.time() + self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info(f"{self.desc_before}, lsn_before={self.lsn_before}") + + def __exit__(self, *exc): + lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info( + f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s" + ) + + # Test that we can create timeline with one safekeeper down and initialize it -# later when some data already had been written. +# later when some data already had been written. 
It is strictly weaker than +# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute +# download (recovery) and as such useful for development/testing. def test_late_init(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -997,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1 = env.safekeepers[0] sk1.stop() - # create and insert smth while safekeeper is down... - env.neon_cli.create_branch("test_late_init") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") + # create and insert smth while safekeeper is down... endpoint.safe_psql("create table t(key int, value text)") - endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") - log.info("insert with safekeeper down done") + with WalChangeLogger(endpoint, "doing insert with sk1 down"): + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") endpoint.stop() # stop compute # stop another safekeeper, and start one which missed timeline creation @@ -1011,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.start() # insert some more - endpoint = env.endpoints.create_start("test_late_init") + with DurationLogger("recovery"): + endpoint = env.endpoints.create_start("test_late_init") endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") + wait_flush_lsn_align_by_ep( + env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]] + ) + # Check that WALs are the same. + cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id) + # is timeline flush_lsn equal on provided safekeepers? 
-def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id): - status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id) - status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id) - log.info( - f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}" +def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + flush_lsns = [ + sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn + for sk_http_cli in sk_http_clis + ] + log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}") + return all([flush_lsns[0] == flsn for flsn in flush_lsns]) + + +def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 + + +# Assert by xxd that WAL on given safekeepers is identical. No compute must be +# running for this to be reliable. +def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed" + sk_http_clis = [sk.http_client() for sk in sks] + + # First check that term / flush_lsn are the same: it is easier to + # report/understand if WALs are different due to that. + statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] + term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + assert ( + term_flush_lsns[0] == tfl + ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + + # check that WALs are identic. 
+ segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] + for cmp_segs, sk in zip(segs[1:], sks[1:]): + assert ( + segs[0] == cmp_segs + ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" + log.info(f"comparing segs {segs[0]}") + + sk0 = sks[0] + for sk in sks[1:]: + (_, mismatch, not_regular) = filecmp.cmpfiles( + sk0.timeline_dir(tenant_id, timeline_id), + sk.timeline_dir(tenant_id, timeline_id), + segs[0], + shallow=False, + ) + log.info( + f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" + ) + + for f in mismatch: + f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) + f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) + stdout_filename = "{}.filediff".format(f2) + + with open(stdout_filename, "w") as stdout_f: + subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) + subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + + cmd = "diff {}.hex {}.hex".format(f1, f2) + subprocess.run([cmd], stdout=stdout_f, shell=True) + + assert (mismatch, not_regular) == ( + [], + [], + ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identic" + + +# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is +# running. ep is stopped by this function. This is used in tests which check +# binary equality of WAL segments on safekeepers; which is inherently racy as +# shutting down endpoint might always write some WAL which can get to only one +# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if +# it has changed. +def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks): + sk_http_clis = [sk.http_client() for sk in sks] + # First wait for the alignment. 
+ wait( + partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id), + "flush_lsn to get aligned", ) - return status1.flush_lsn == status2.flush_lsn + ep.stop() # then stop endpoint + # Even if there is no compute, there might be some in flight data; ensure + # all walreceivers die before rechecking. + for sk_http_cli in sk_http_clis: + wait( + partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id), + "walreceivers to be gone", + ) + # Now recheck again flush_lsn and exit if it is good + if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + return + # Otherwise repeat. + log.info("flush_lsn changed during endpoint shutdown; retrying alignment") + ep = env.endpoints.create_start(branch) -# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that -# 1) walproposer can't recover node if it misses WAL written by previous computes, but -# still starts up and functions normally if two other sks are ok. -# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions -# normally if two other sks are ok. -# 3) Lagged safekeeper can still recover by peer recovery. -def test_one_sk_down(neon_env_builder: NeonEnvBuilder): - pass +# Test behaviour with one safekeeper down and missing a lot of WAL, exercising +# neon_walreader and checking that pg_wal never bloats. Namely, ensures that +# compute doesn't keep many WAL for lagging sk, but still can recover it with +# neon_walreader, in two scenarious: a) WAL never existed on compute (it started +# on basebackup LSN later than lagging sk position) though segment file exists +# b) WAL had been recycled on it and segment file doesn't exist. +# +# Also checks along the way that whenever there are two sks alive, compute +# should be able to commit. +def test_lagging_sk(neon_env_builder: NeonEnvBuilder): + # inserts ~20MB of WAL, a bit more than a segment. 
+ def fill_segment(ep): + ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + (sk1, sk2, sk3) = env.safekeepers + + # create and insert smth while safekeeper is down... + sk1.stop() + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_lagging_sk") + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("create table t(key int, value text)") + # make small insert to be on the same segment + ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'") + log.info("insert with safekeeper down done") + ep.stop() # stop compute + + # Stop another safekeeper, and start one which missed timeline creation. + sk2.stop() + sk1.start() + + # Start new ep and insert some more. neon_walreader should download WAL for + # sk1 because it should be filled since the horizon (initial LSN) which is + # earlier than basebackup LSN. + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now repeat insertion with sk1 down, but with inserting more data to check + # that WAL on compute is removed. + sk1.stop() + sk2.start() + + # min_wal_size must be at least 2x segment size. 
+ min_wal_config = [ + "min_wal_size=32MB", + "max_wal_size=32MB", + "wal_keep_size=0", + "log_checkpoints=on", + ] + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + with WalChangeLogger(ep, "doing large insert with sk1 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + sk2.stop() # stop another sk to ensure sk1 and sk3 can work + sk1.start() + with DurationLogger("recovery"): + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now do the same with different safekeeper sk2 down, and restarting ep + # before recovery (again scenario when recovery starts below basebackup_lsn, + # but multi segment now). + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + with WalChangeLogger(ep, "doing large insert with sk2 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + ep.stop() + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + sk2.start() + with DurationLogger("recovery"): + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3]) + # Check that WALs are the same. 
+ cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) # Smaller version of test_one_sk_down testing peer recovery in isolation: that @@ -1052,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): sk2_http_cli = sk2.http_client() # ensure tli gets created on sk1, peer recovery won't do that wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) @@ -1074,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024 # wait a bit, lsns shouldn't change - # time.sleep(5) + time.sleep(2) sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id) sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id) log.info( @@ -1085,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # now restart safekeeper with peer recovery enabled and wait for recovery sk1.stop().start(extra_opts=["--peer-recovery=true"]) wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) - # check that WALs are identic after recovery - segs = sk1.list_segments(tenant_id, timeline_id) - log.info(f"segs are {segs}") - - (_, mismatch, not_regular) = filecmp.cmpfiles( - sk1.timeline_dir(tenant_id, timeline_id), - sk2.timeline_dir(tenant_id, timeline_id), - segs, - shallow=False, - ) - log.info( - f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" - ) - - for f in mismatch: - f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) - - with open(stdout_filename, "w") as stdout_f: - 
subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) - - cmd = "diff {}.hex {}.hex".format(f1, f2) - subprocess.run([cmd], stdout=stdout_f, shell=True) - - assert (mismatch, not_regular) == ([], []) + cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() @@ -1351,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted -# to all safekeepers. This test checks that compute WAL can fit into small number -# of WAL segments. -def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): - # used to calculate delta in collect_stats - last_lsn = Lsn(0) - - # returns pg_wal size in MB - def collect_stats(endpoint: Endpoint, cur, enable_logs=True): - nonlocal last_lsn - assert endpoint.pgdata_dir is not None - - log.info("executing INSERT to generate WAL") - current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 - if enable_logs: - lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 - log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") - last_lsn = current_lsn - return pg_wal_size_mb - - # generates about ~20MB of WAL, to create at least one new segment - def generate_wal(cur): - cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch("test_wal_deleted_after_broadcast") - # Adjust checkpoint config to prevent keeping old WAL segments - endpoint = env.endpoints.create_start( - "test_wal_deleted_after_broadcast", - config_lines=["min_wal_size=32MB", "max_wal_size=32MB", 
"log_checkpoints=on"], - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - cur.execute("CREATE TABLE t(key int, value text)") - - collect_stats(endpoint, cur) - - # generate WAL to simulate normal workload - for _ in range(5): - generate_wal(cur) - collect_stats(endpoint, cur) - - log.info("executing checkpoint") - cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(endpoint, cur) - - # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) - assert wal_size_after_checkpoint < 16 * 2.5 - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index feab7e605b..77d67cd63a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): asyncio.run(run_unavailability(env, endpoint)) +async def run_recovery_uncommitted(env: NeonEnv): + (sk1, sk2, _) = env.safekeepers + + env.neon_cli.create_branch("test_recovery_uncommitted") + ep = env.endpoints.create_start("test_recovery_uncommitted") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one safekeeper up to create tail of flushed but not committed WAL + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 2000), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. 
+ ep.stop_and_destroy() + + # Start one of sks to make quorum online plus compute and ensure they can + # sync. + sk2.start() + ep = env.endpoints.create_start( + "test_recovery_uncommitted", + ) + ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'") + + +# Test pulling uncommitted WAL (up to flush_lsn) during recovery. +def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_recovery_uncommitted(env)) + + @dataclass class RaceConditionTest: iteration: int diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index b039b36255..7d03f644d1 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -11,6 +11,7 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) +from fixtures.pageserver.utils import timeline_delete_wait_completed from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage from fixtures.types import Lsn, TenantId, TimelineId @@ -125,3 +126,32 @@ def test_wal_restore_initdb( ) log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}") assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] + + +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("create table t as select generate_series(1,300000)") + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_client = env.pageserver.http_client() + + # shut down the endpoint and delete the timeline from the pageserver + endpoint.stop() + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) + + # issue the restoration command + ps_client.timeline_create( + tenant_id=tenant_id, + 
new_timeline_id=timeline_id, + existing_initdb_timeline_id=timeline_id, + pg_version=env.pg_version, + ) + + # the table is back now! + restored = env.endpoints.create_start("main") + assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dd067cf656..03358bb0b5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dd067cf656f6810a25aca6025633d32d02c5085a +Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index bc88f53931..a2dc225ddf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit bc88f539312fcc4bb292ce94ae9db09ab6656e8a +Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e3a22b7292..225071f482 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e3a22b72922055f9212eca12700190f118578362 +Subproject commit 225071f482774943854c2eec4540757e01171557 diff --git a/vendor/revisions.json b/vendor/revisions.json index c4cea208ee..def4eab069 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "e3a22b72922055f9212eca12700190f118578362", - "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a", - "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a" + "postgres-v16": "225071f482774943854c2eec4540757e01171557", + "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", + "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index d0ba742a63..704e3721d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -13,6 +13,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: 
respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -30,8 +34,9 @@ files: server_tls_sslmode=disable pool_mode=transaction max_client_conn=10000 - default_pool_size=16 + default_pool_size=64 max_prepared_statements=0 + admin_users=cloud_admin - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -46,6 +51,77 @@ files: } memory {} } + - filename: sql_exporter.yml + content: | + # Configuration for sql_exporter + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. 
+ data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). + collectors: [neon_collector] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). + collector_files: + - "neon_collector.yml" + - filename: neon_collector.yml + content: | + collector_name: neon_collector + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'lfc_used' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + build: | # Build cgroup-tools # @@ -82,6 +158,8 @@ build: | FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM burningalchemist/sql_exporter:0.13 AS sql-exporter + # Build pgbouncer # FROM debian:bullseye-slim AS pgbouncer @@ -89,22 +167,21 @@ build: | && apt-get update \ && apt-get install -y \ build-essential \ - curl \ + git \ libevent-dev \ - libssl-dev \ - patchutils \ + libtool \ pkg-config - ENV PGBOUNCER_VERSION 1.21.0 - ENV PGBOUNCER_GITPATH 1_21_0 + # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. 
+ # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) + ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 RUN set -e \ - && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && cd pgbouncer-${PGBOUNCER_VERSION} \ - && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ - && make -j $(nproc) \ - && make install + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= merge: | # tweak nofile limits RUN set -e \ @@ -116,13 +193,19 @@ merge: | COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini + COPY sql_exporter.yml /etc/sql_exporter.yml + COPY neon_collector.yml /etc/neon_collector.yml + RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0644 /etc/pgbouncer.ini \ - && chmod 0644 /etc/cgconfig.conf + && chmod 0644 /etc/cgconfig.conf \ + && chmod 0644 /etc/sql_exporter.yml \ + && chmod 0644 /etc/neon_collector.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter + COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3e46731adf..4f13064088 100644 --- a/workspace_hack/Cargo.toml +++ 
b/workspace_hack/Cargo.toml @@ -39,6 +39,7 @@ futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +getrandom = { version = "0.2", default-features = false, features = ["std"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -50,12 +51,14 @@ nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } -regex-syntax = { version = "0.7" } +regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16", features = ["std"] } +ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -65,7 +68,7 @@ subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = 
["serde"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } @@ -74,8 +77,8 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4"] } -zstd = { version = "0.12" } -zstd-safe = { version = "6", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } +zstd = { version = "0.13" } +zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] @@ -83,14 +86,17 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } +getrandom = { version = "0.2", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +once_cell = { version = "1" } prost = { version = "0.11" } regex = { version = "1" } -regex-syntax = { version = "0.7" } +regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }