diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 0000000000..8bccd51c6d --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,2 @@ +[profile.default] +slow-timeout = "1m" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e2f15d96db..8bf12c31b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -23,6 +23,21 @@ on: type: boolean description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false + collect_olap_explain: + type: boolean + description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected' + required: false + default: false + collect_pg_stat_statements: + type: boolean + description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected' + required: false + default: false + run_AWS_RDS_AND_AURORA: + type: boolean + description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' + required: false + default: false defaults: run: @@ -113,6 +128,8 @@ jobs: # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + env: + RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} runs-on: ubuntu-latest outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -152,7 +169,7 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, { "platform": "rds-aurora" }]') fi @@ -171,9 +188,9 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -337,6 +354,8 @@ jobs: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} @@ -399,6 +418,8 @@ jobs: env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 diff --git 
a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..e401b2f418 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,105 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + outputs: + build-tools-tag: + description: "tag generated for build tools" + value: ${{ jobs.tag.outputs.build-tools-tag }} + +jobs: + check-if-build-tools-dockerfile-changed: + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then + echo "docker_file_changed=false" >> $GITHUB_OUTPUT + exit + fi + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + tag: + runs-on: ubuntu-latest + needs: [ check-if-build-tools-dockerfile-changed ] + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + env: + DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + kaniko: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + + kaniko-arm: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + manifest: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - tag + - kaniko + - kaniko-arm + - 
check-if-build-tools-dockerfile-changed + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 693ed1a66f..78deff6e85 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,7 +44,6 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -74,11 +73,19 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + build-buildtools-image: needs: [ check-permissions ] + uses: ./.github/workflows/build_and_push_docker_image.yml + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + secrets: inherit + + check-codestyle-python: + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -108,10 +115,10 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions ] + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +182,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -332,16 +339,16 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run cargo test + - name: Run rust tests run: | - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -351,7 +358,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ 
vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install rust binaries run: | @@ -408,10 +415,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, tag ] + needs: [ check-permissions, build-neon, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -447,10 +454,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -479,12 +486,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -526,11 +533,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -694,7 +700,7 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -733,6 +739,7 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -743,7 +750,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -778,6 +785,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -788,7 +796,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -836,6 +844,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} @@ -857,7 +866,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.19.0 + VM_BUILDER_VERSION: v0.21.0 steps: - name: Checkout diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index b1ea5e4f74..c6c2b7386a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -218,7 +218,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow it used to update tag of build tools in ECR. +# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
+ +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest 
"${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index c5fc121ac2..3f4495c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to an buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. + +You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned \ + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. +``` \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f931fd6c29..abd87dc0da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -190,9 +190,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.0" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" +checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5" dependencies = [ "flate2", "futures-core", @@ -1168,6 +1168,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rust-ini", "serde", "serde_json", "tar", @@ -1201,6 +1202,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.11", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.9" @@ -1433,6 +1454,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1575,6 +1602,15 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ 
-2106,6 +2142,20 @@ dependencies = [ "hashbrown 0.13.2", ] +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 0.21.1", + "byteorder", + "crossbeam-channel", + "flate2", + "nom", + "num-traits", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2487,13 +2537,14 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", - "pem 1.1.1", - "ring 0.16.20", + "js-sys", + "pem 3.0.3", + "ring 0.17.6", "serde", "serde_json", "simple_asn1", @@ -3028,6 +3079,16 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "ordered-multimap" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +dependencies = [ + "dlv-list", + "hashbrown 0.14.0", +] + [[package]] name = "os_info" version = "3.7.0" @@ -3056,6 +3117,28 @@ dependencies = [ "sha2", ] +[[package]] +name = "pagebench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "hdrhistogram", + "humantime", + "humantime-serde", + "pageserver", + "pageserver_api", + "pageserver_client", + "rand 0.8.5", + "serde", + "serde_json", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pagectl" version = "0.1.0" @@ -3145,6 +3228,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", @@ -3182,11 +3266,19 @@ dependencies = [ name = "pageserver_client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", + "bytes", + "futures", "pageserver_api", + "postgres", "reqwest", "serde", "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", "utils", "workspace_hack", ] @@ -3282,18 +3374,19 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.1.1" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" dependencies = [ - "base64 0.13.1", + "base64 0.21.1", + "serde", ] [[package]] name = "pem" -version = "2.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ "base64 0.21.1", "serde", @@ -4169,6 +4262,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4419,12 +4522,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.6", + "untrusted 0.9.0", ] [[package]] @@ -5123,6 +5226,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -6290,6 +6402,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", + "getrandom 0.2.11", "hex", "hmac", "hyper", @@ -6301,6 +6414,7 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "once_cell", "prost", "rand 0.8.5", "regex", @@ -6403,30 +6517,28 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" [[package]] name = "zstd" -version = "0.12.4" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.6" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index b44544d626..5de636778a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "pageserver", "pageserver/ctl", "pageserver/client", + "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", @@ -79,6 +80,7 @@ futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" +hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" @@ -91,7 +93,7 @@ hyper-tungstenite = "0.11" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" -jsonwebtoken = "8" +jsonwebtoken = "9" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..c2fcd8841e --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,166 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 25.1 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.4.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. 
\ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.74.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . 
"$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + cargo install cargo-nextest && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 03280586f8..14ba1b5b9a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG @@ -48,7 +48,29 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ + # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. + # In vanilla postgres this function is limited to Postgres role superuser. + # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, + # so we do it here. + old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ + # the first loop is for pg_stat_statement extension version <= 1.6 + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ + fi; \ + done; \ + # the second loop is for pg_stat_statement extension versions >= 1.7, + # where pg_stat_statement_reset() got 3 additional arguments + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if ! 
echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ + fi; \ + done ######################################################################################### # @@ -569,6 +591,23 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-semver-pg-build" +# compile pg_semver extension +# +######################################################################################### +FROM build-deps AS pg-semver-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ + echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ + mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control + ######################################################################################### # # Layer "pg-embedding-pg-build" @@ -768,6 +807,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY pgxn/ pgxn/ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/README.md b/README.md index 3e3123f5ee..98af1edee6 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python-poetry lsof libicu-dev +libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel openssl poetry lsof libicu-devel + protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \ + libffi-devel ``` * On Arch based systems, these packages are needed: ```bash diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 47378f1910..142fa08495 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -37,5 +37,6 @@ 
workspace_hack.workspace = true toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } -zstd = "0.12.4" +zstd = "0.13" bytes = "1.0" +rust-ini = "0.20.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index ce7345d5be..436db59088 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,7 +31,9 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r http://pg-ext-s3-gateway +//! -r http://pg-ext-s3-gateway \ +//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' +//! --pgbouncer-ini-path /etc/pgbouncer.ini \ //! ``` //! use std::collections::HashMap; @@ -99,6 +101,9 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); + let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -209,6 +214,8 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, + pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), + pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); @@ -493,6 +500,23 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("pgbouncer-connstr") + .long("pgbouncer-connstr") + .default_value( + "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", + ) + .value_name("PGBOUNCER_CONNSTR"), + ) + .arg( + Arg::new("pgbouncer-ini-path") + .long("pgbouncer-ini-path") + // Note: this doesn't match current path for pgbouncer.ini. + // Until we fix it, we need to pass the path explicitly + // or this will be effectively no-op. + .default_value("/etc/pgbouncer.ini") + .value_name("PGBOUNCER_INI_PATH"), + ) } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b39a800f14..cd7be0520e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -7,6 +7,7 @@ use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::{Condvar, Mutex, RwLock}; +use std::thread; use std::time::Instant; use anyhow::{Context, Result}; @@ -64,6 +65,10 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? 
pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, + // connection string to pgbouncer to change settings + pub pgbouncer_connstr: Option, + // path to pgbouncer.ini to change settings + pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -737,6 +742,31 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); @@ -791,6 +821,32 @@ impl ComputeNode { pspec.timeline_id, ); + // tune pgbouncer + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b79e516650..0b0e137c03 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,9 +9,11 @@ use std::process::Child; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use tracing::{debug, instrument}; +use tokio_postgres::NoTls; +use tracing::{debug, error, info, instrument}; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; @@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { Ok(()) } + +/// Update pgbouncer.ini with provided options +pub fn update_pgbouncer_ini( + pgbouncer_config: HashMap, + pgbouncer_ini_path: &str, +) -> Result<()> { + let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; + let section = conf.section_mut(Some("pgbouncer")).unwrap(); + + for (option_name, value) in pgbouncer_config.iter() { + section.insert(option_name, value); + } + + conf.write_to_file(pgbouncer_ini_path)?; + Ok(()) +} + +/// Tune pgbouncer. +/// 1. Apply new config using pgbouncer admin console +/// 2. 
Add new values to pgbouncer.ini to preserve them after restart +pub async fn tune_pgbouncer( + pgbouncer_settings: Option>, + pgbouncer_connstr: &str, + pgbouncer_ini_path: Option, +) -> Result<()> { + if let Some(pgbouncer_config) = pgbouncer_settings { + // Apply new config + let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + for (option_name, value) in pgbouncer_config.iter() { + info!( + "Applying pgbouncer setting change: {} = {}", + option_name, value + ); + let query = format!("SET {} = {}", option_name, value); + + let result = client.simple_query(&query).await; + + info!("Applying pgbouncer setting change: {}", query); + info!("pgbouncer setting change result: {:?}", result); + + if let Err(err) = result { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err + ); + }; + } + + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + if let Some(pgbouncer_ini_path) = pgbouncer_ini_path { + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + } + } + + Ok(()) +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 20299c8fde..d545858dc2 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -370,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli Ok(()) } +fn reassign_owned_objects_in_one_db( + conf: Config, + role_name: &PgIdent, + db_owner: &PgIdent, +) -> Result<()> { + let mut client = conf.connect(NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.pg_quote(), + db_owner.pg_quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, + conf.get_dbname().unwrap_or(""), + db_owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); + client.simple_query(&drop_query)?; + Ok(()) +} + // Reassign all owned objects in all databases to the owner of the database. fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { for db in &spec.cluster.databases { if db.owner != *role_name { let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); - - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db.owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, &db.name, &db.owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; + reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; } } + // Also handle case when there are no databases in the spec. + // In this case we need to reassign objects in the default database. 
+ let conf = Config::from_str(connstr)?; + let db_owner = PgIdent::from_str("cloud_admin")?; + reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; + Ok(()) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 071f22dc2b..55b66742ca 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -537,6 +537,7 @@ impl Endpoint { safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, + pgbouncer_settings: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/deny.toml b/deny.toml index 079dcac679..22e39a2ca3 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ allow = [ "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", + "CC0-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/docs/rfcs/029-getpage-throttling.md b/docs/rfcs/029-getpage-throttling.md new file mode 100644 index 0000000000..b4f9adefc5 --- /dev/null +++ b/docs/rfcs/029-getpage-throttling.md @@ -0,0 +1,197 @@ +# Per-Tenant GetPage@LSN Throttling + +Author: Christian Schwarz +Date: Oct 24, 2023 + +## Summary + +This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver +and the interactions with its client, i.e., the neon_smgr component in Compute. + +The result of implementing & executing this RFC will be a fleet-wide upper limit for +**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**. + +## Background + +### GetPage@LSN Request Flow + +Pageserver exposes its `page_service.rs` as a libpq listener. +The Computes' `neon_smgr` module connects to that libpq listener. +Once a connection is established, the protocol allows Compute to request page images at a given LSN. +We call these requests GetPage@LSN requests, or GetPage requests for short. +Other request types can be sent, but these are low traffic compared to GetPage requests +and are not the concern of this RFC. + +Pageserver associates one libpq connection with one tokio task. + +Per connection/task, the pq protocol is handled by the common `postgres_backend` crate. +Its `run_message_loop` function invokes the `page_service` specific `impl postgres_backend::Handler for PageServerHandler`. +Requests are processed in the order in which they arrive via the TCP-based pq protocol. +So, there is no concurrent request processing within one connection/task. + +There is a degree of natural pipelining: +Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream. +And Pageserver can fill the pipe with responses in the other direction. +Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc. + +### GetPage@LSN Access Pattern + +The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC). +Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches. + +If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*. + +## Motivation + +In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h, +then dropping to ca 18k GetPage/second for a period of 9h. + +We noticed this because of an internal GetPage latency SLO burn rate alert, i.e., +the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO. 
+
+Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same pageserver.
+
+However, here are some illustrative data points for the 155k period:
+The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
+We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is ~1.18GiB/s = ~10.2Gb/s.`)
+The CPU utilization of the instance was 75% user+system.
+Pageserver page cache served 1.75M accesses/second at a hit rate of ca. 90%.
+The hit rate for materialized pages was ca. 40%.
+Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
+
+The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
+The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
+The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
+
+My claim is that it was **unhealthy to serve this workload at the pace we did**:
+* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
+* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
+  * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
+    At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
+    The result is an **uneven** performance profile from the Compute perspective.
+
+  * **economics**: Neon currently does not charge for IOPS, only capacity.
+    **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
+    For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
+    Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
+    We charge 0$.
+    It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
+
+
+Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
+vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
+So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
+
+## Solution: Throttling GetPage Requests
+
+**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
+That is, unless we want to start charging for provisioned GetPage@LSN/second.
+Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
+Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
+
+## The Design Space
+
+What remains is the question of *policy* and *mechanism*:
+
+**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
+Candidates are:
+
+* hard limit, same limit value per connection|timeline|tenant
+  * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance. This is a major operational pain point / risk right now.
+* hard limit, configurable per connection|timeline|tenant
+  * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
+  * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., it is not a mechanism to guarantee a certain QoS for a tenant.
+* fair share among active connections|timelines|tenants per instance
+  * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
+  * NB: needs a definition of "active", and knowledge of the available GetPage/second capacity in advance
+* ...
+
+Regarding **mechanism**, it's clear that **backpressure** is the way to go.
+However, we must choose between
+* **implicit** backpressure through pq/TCP and
+* **explicit** rejection of requests + retries with exponential backoff
+
+Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
+where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
+
+And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
+how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
+
+## Scope Of This RFC
+
+**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.
+
+This proposal is easy to implement and significantly de-risks operating large Pageservers,
+based on the assumption that extremely-high-GetPage-rate episodes like the one from the "Motivation" section are uncorrelated between tenants.
+
+For example, suppose our Pageserver can serve 100k GetPage/second in total at a 100% page cache miss rate.
+If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.
+
+The mechanism for backpressure will be TCP-based implicit backpressure.
+The compute team isn't concerned about prefetch queue depth.
+Pageserver will implement it by delaying the reading of requests from the libpq connection(s).
+
+The rate limit will be implemented using a per-tenant token bucket.
+The bucket will be shared among all connections to the tenant.
+The bucket implementation supports starvation-preventing `await`ing.
+The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
+The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
+can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
+A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
+
+Regarding metrics / the internal GetPage latency SLO:
+we will measure the SLI for the GetPage latency SLO _after_ the throttler and introduce new metrics to quantify the amount of throttling:
+- a histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
+- a histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
+
+Further observability measures:
+- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in the last minute.
+ The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats. + +Rollout will happen as follows: +- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf +- experimentation in staging and later production to study impact & interaction with auto-scaling +- determination of a sensible global default value + - the value will be chosen as high as possible ... + - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance. +- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default +- reset of the experimental per-tenant overrides +- gain experience & lower the limit over time + - we stop lowering the limit as soon as this RFC's goal is achieved, i.e., + once we decide that in practice the chosen value sufficiently de-risks operating large pageservers + +The per-tenant override will remain for emergencies and testing. +But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant. + +Toward the upper layers of the Neon stack, the resulting limit will be +**"the highest GetPage/second that Pageserver can support for a single tenant"**. + +### Rationale + +We decided against error + retry because of worries about starvation. + +## Future Work + +Enable per-tenant emergency override of the limit via Console. +Should be part of a more general framework to specify tenant config overrides. +**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users, +or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that +concerns itself with GetPage/second capacity planning. + +Compute-side metrics for GetPage latency. + +Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled. + +Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss. + +Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant. +Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance. +With per-tenant rate limiting, we will not meet that expectation. +However, we can currently only scale per tenant. +Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis. +But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit. +To solve this properly, I think we'll need replicas for tenants / shard. +To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2a483188e4..4ff6831272 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -73,6 +73,8 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, + + pub pgbouncer_settings: Option>, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
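For illustration, a minimal sketch of the per-tenant token bucket described in the RFC above, assuming the `leaky-bucket` 1.x builder API; the `TenantThrottle` type, its method names, and its wiring into the page_service connection handlers are hypothetical and not part of this patch.

```rust
use std::sync::Arc;
use std::time::Duration;

use leaky_bucket::RateLimiter;

/// One throttle per tenant, shared by all page_service connections of that tenant.
/// (Illustrative type; the actual integration point is left open by the RFC.)
pub struct TenantThrottle {
    limiter: Arc<RateLimiter>,
}

impl TenantThrottle {
    /// `rate` is the per-tenant GetPage/second limit, e.g. the global default from tenant_conf.
    pub fn new(rate: usize) -> Self {
        let limiter = RateLimiter::builder()
            .max(rate) // bucket capacity: at most ~1 second worth of burst
            .initial(rate) // start full so a freshly attached tenant is not throttled immediately
            .refill(rate) // refill `rate` tokens ...
            .interval(Duration::from_secs(1)) // ... every second
            .build();
        Self {
            limiter: Arc::new(limiter),
        }
    }

    /// Called by a connection handler before it reads the next GetPage request
    /// off the libpq connection; waiters are queued by the limiter, so no single
    /// connection of the tenant is starved.
    pub async fn acquire_one_getpage(&self) {
        self.limiter.acquire_one().await;
    }
}
```

Because the handler `await`s `acquire_one_getpage()` before reading the next request, a tenant that exhausts its budget simply stops being read from; toward `neon_smgr` this surfaces as ordinary TCP backpressure, matching the implicit-backpressure mechanism the RFC chooses.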
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index e2afa17ef0..ccd015ad19 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -243,5 +243,9 @@ "public_extensions": [ "postgis" ] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a78ba8ad94..be41b610b8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,7 @@ pub mod partitioning; use std::{ collections::HashMap, + io::Read, num::{NonZeroU64, NonZeroUsize}, time::SystemTime, }; @@ -19,7 +20,7 @@ use utils::{ use crate::{reltag::RelTag, shard::TenantShardId}; use anyhow::bail; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. /// @@ -369,6 +370,14 @@ pub struct TenantInfo { pub attachment_status: TenantAttachmentStatus, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantDetails { + #[serde(flatten)] + pub tenant_info: TenantInfo, + + pub timelines: Vec, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -576,6 +585,7 @@ pub enum PagestreamFeMessage { } // Wrapped in libpq CopyData +#[derive(strum_macros::EnumProperty)] pub enum PagestreamBeMessage { Exists(PagestreamExistsResponse), Nblocks(PagestreamNblocksResponse), @@ -584,6 +594,29 @@ pub enum PagestreamBeMessage { DbSize(PagestreamDbSizeResponse), } +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamBeMessageTag { + Exists = 100, + Nblocks = 101, + GetPage = 102, + Error = 103, + DbSize = 104, +} +impl TryFrom for PagestreamBeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 100 => Ok(PagestreamBeMessageTag::Exists), + 101 => Ok(PagestreamBeMessageTag::Nblocks), + 102 => Ok(PagestreamBeMessageTag::GetPage), + 103 => Ok(PagestreamBeMessageTag::Error), + 104 => Ok(PagestreamBeMessageTag::DbSize), + _ => Err(value), + } + } +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { pub latest: bool, @@ -739,35 +772,91 @@ impl PagestreamBeMessage { pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); + use PagestreamBeMessageTag as Tag; match self { Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Exists as u8); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Nblocks as u8); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::GetPage as u8); bytes.put(&resp.page[..]); } Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Error as u8); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } } bytes.into() } + + pub fn deserialize(buf: Bytes) -> anyhow::Result { + let mut buf = buf.reader(); + let msg_tag = buf.read_u8()?; + + use PagestreamBeMessageTag as Tag; + let ok = + match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
{ + Tag::Exists => { + let exists = buf.read_u8()?; + Self::Exists(PagestreamExistsResponse { + exists: exists != 0, + }) + } + Tag::Nblocks => { + let n_blocks = buf.read_u32::()?; + Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + } + Tag::GetPage => { + let mut page = vec![0; 8192]; // TODO: use MaybeUninit + buf.read_exact(&mut page)?; + PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + } + Tag::Error => { + let buf = buf.get_ref(); + let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?; + let rust_str = cstr.to_str()?; + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: rust_str.to_owned(), + }) + } + Tag::DbSize => { + let db_size = buf.read_i64::()?; + Self::DbSize(PagestreamDbSizeResponse { db_size }) + } + }; + let remaining = buf.into_inner(); + if !remaining.is_empty() { + anyhow::bail!( + "remaining bytes in msg with tag={msg_tag}: {}", + remaining.len() + ); + } + Ok(ok) + } + + pub fn kind(&self) -> &'static str { + match self { + Self::Exists(_) => "Exists", + Self::Nblocks(_) => "Nblocks", + Self::GetPage(_) => "GetPage", + Self::Error(_) => "Error", + Self::DbSize(_) => "DbSize", + } + } } #[cfg(test)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 052fbd1402..3e4936eec4 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -81,6 +81,10 @@ impl TenantShardId { pub fn is_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } } /// Formatting helper @@ -159,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId { /// shard we're dealing with, but do not need to know the full ShardIdentity (because /// we won't be doing any page->shard mapping), and do not need to know the fully qualified /// TenantShardId. 
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, pub shard_count: ShardCount, diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 548bde02f6..7ea1103eb2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -117,6 +117,8 @@ impl AzureBlobStorage { ) -> Result { let mut response = builder.into_stream(); + let mut etag = None; + let mut last_modified = None; let mut metadata = HashMap::new(); // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 @@ -124,6 +126,13 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part.map_err(to_download_error)?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } @@ -136,6 +145,8 @@ impl AzureBlobStorage { } Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, metadata: Some(StorageMetadata(metadata)), }) } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e77c54e1e7..3e408e3119 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,7 +14,9 @@ mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; +use std::{ + collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, +}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -207,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } +pub type DownloadStream = Pin> + Unpin + Send + Sync>>; pub struct Download { - pub download_stream: Pin> + Unpin + Send + Sync>>, + pub download_stream: DownloadStream, + /// The last time the file was modified (`last-modified` HTTP header) + pub last_modified: Option, + /// A way to identify this specific version of the resource (`etag` HTTP header) + pub etag: Option, /// Extra key-value data, associated with the current remote file. 
pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 03b98e5ea2..d1e7d325b9 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; +use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs { .map_err(DownloadError::Other)?; Ok(Download { metadata, + last_modified: None, + etag: None, download_stream: Box::pin(source), }) } else { @@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs { .await .map_err(DownloadError::Other)?; - Ok(match end_exclusive { - Some(end_exclusive) => Download { - metadata, - download_stream: Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - }, - None => Download { - metadata, - download_stream: Box::pin(ReaderStream::new(source)), - }, + let download_stream: DownloadStream = match end_exclusive { + Some(end_exclusive) => Box::pin(ReaderStream::new( + source.take(end_exclusive - start_inclusive), + )), + None => Box::pin(ReaderStream::new(source)), + }; + Ok(Download { + metadata, + last_modified: None, + etag: None, + download_stream, }) } else { Err(DownloadError::NotFound) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 97fa1bbf5b..0f95458ad1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,6 +16,7 @@ use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, + profile::ProfileFileCredentialsProvider, provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, @@ -74,20 +75,29 @@ impl S3Bucket { let region = Some(Region::new(aws_config.bucket_region.clone())); + let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket - .or_else("token", { - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + .or_else( + "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) - .build() - }) + .build(), + ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; @@ -218,17 +228,11 @@ impl S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - if get_object.is_err() { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Err, - started_at, - ); - } - match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag.clone(); + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); let body = object_output.body; let body = 
ByteStreamAsStream::from(body); @@ -237,15 +241,33 @@ impl S3Bucket { Ok(Download { metadata, + etag, + last_modified, download_stream: Box::pin(body), }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); Err(DownloadError::NotFound) } - Err(e) => Err(DownloadError::Other( - anyhow::Error::new(e).context("download s3 object"), - )), + Err(e) => { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + Err(DownloadError::Other( + anyhow::Error::new(e).context("download s3 object"), + )) + } } } } diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs new file mode 100644 index 0000000000..bca117ed1a --- /dev/null +++ b/libs/remote_storage/tests/common/mod.rs @@ -0,0 +1,200 @@ +use std::collections::HashSet; +use std::ops::ControlFlow; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use bytes::Bytes; +use camino::Utf8Path; +use futures::stream::Stream; +use once_cell::sync::OnceCell; +use remote_storage::{Download, GenericRemoteStorage, RemotePath}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn upload_stream( + content: std::borrow::Cow<'static, [u8]>, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + use std::borrow::Cow; + + let content = match content { + Cow::Borrowed(x) => Bytes::from_static(x), + Cow::Owned(vec) => Bytes::from(vec), + }; + wrap_stream(content) +} + +pub(crate) fn wrap_stream( + content: bytes::Bytes, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + let len = content.len(); + let content = futures::future::ready(Ok(content)); + + (futures::stream::once(content), len) +} + +pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result> { + let mut buf = Vec::new(); + tokio::io::copy_buf( + &mut tokio_util::io::StreamReader::new(dl.download_stream), + &mut buf, + ) + .await?; + Ok(buf) +} + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
+pub(crate) async fn upload_simple_remote_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new( + Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), + ) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} + +pub(crate) async fn cleanup( + client: &Arc, + objects_to_delete: HashSet, +) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} +pub(crate) struct Uploads { + pub(crate) prefixes: HashSet, + pub(crate) blobs: HashSet, +} + +pub(crate) async fn upload_remote_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); + let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, data_len) = + upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, data_len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + 
.context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +pub(crate) fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + }); +} diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 7327803198..0387dc30e7 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ - AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. /// -/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`] +/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -97,7 +97,7 @@ async fn azure_pagination_should_work( /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. /// See `Azure_pagination_should_work` for more information. /// -/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`] +/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. 
`list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -218,18 +218,9 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res ctx.client.upload(data, len, &path, None).await?; - async fn download_and_compare(dl: Download) -> anyhow::Result> { - let mut buf = Vec::new(); - tokio::io::copy_buf( - &mut tokio_util::io::StreamReader::new(dl.download_stream), - &mut buf, - ) - .await?; - Ok(buf) - } // Normal download request let dl = ctx.client.download(&path).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) @@ -237,12 +228,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 0, Some(len as u64)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) @@ -250,17 +241,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 8, Some(len as u64 * 100)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); @@ -272,17 +263,6 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); -} - struct EnabledAzure { client: Arc, base_prefix: &'static str, @@ -352,7 +332,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -414,7 +394,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_simple_azure_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -478,166 +458,3 @@ fn create_azure_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_azure_data( - 
client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
-async fn upload_simple_azure_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled -// to binary -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index ecd834e61c..8f46b2abd6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
/// -/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. /// See `s3_pagination_should_work` for more information. /// -/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`] +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -198,15 +198,65 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let MaybeEnabledS3::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None).await?; + + // Normal download request + let dl = ctx.client.download(&path).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(len as u64)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // partial range (end specified) + let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(len as u64 * 100)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); + + // Partial range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); + + // Full range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete(&path) + .await + .with_context(|| format!("{path:?} removal"))?; + + Ok(()) } struct 
EnabledS3 { @@ -278,7 +328,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -340,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_simple_s3_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -403,166 +453,3 @@ fn create_s3_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_s3_data( - client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. 
See test description for more details. -async fn upload_simple_s3_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 262dcb8a8a..b3269ae049 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -366,6 +366,49 @@ impl MonotonicCounter for RecordLsn { } } +/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// +/// This is used by the `pagebench` pageserver benchmarking tool. 
+pub struct LsnSampler(::Sampler); + +impl rand::distributions::uniform::SampleUniform for Lsn { + type Sampler = LsnSampler; +} + +impl rand::distributions::uniform::UniformSampler for LsnSampler { + type X = Lsn; + + fn new(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn new_inclusive(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn sample(&self, rng: &mut R) -> Self::X { + Lsn(self.0.sample(rng)) + } +} + #[cfg(test)] mod tests { use crate::bin_ser::BeSer; diff --git a/libs/walproposer/bindgen_deps.h b/libs/walproposer/bindgen_deps.h index b95788347c..41ee1cd4a3 100644 --- a/libs/walproposer/bindgen_deps.h +++ b/libs/walproposer/bindgen_deps.h @@ -1 +1,2 @@ +#include "postgres.h" #include "walproposer.h" diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 77afe1e686..1f7bf952dc 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -8,12 +8,12 @@ use std::ffi::CString; use crate::bindings::uint32; use crate::bindings::walproposer_api; +use crate::bindings::NeonWALReadResult; use crate::bindings::PGAsyncReadResult; use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; use crate::bindings::Size; use crate::bindings::StringInfoData; -use crate::bindings::TimeLineID; use crate::bindings::TimestampTz; use crate::bindings::WalProposer; use crate::bindings::WalProposerConnStatusType; @@ -178,31 +178,11 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download( - sk: *mut Safekeeper, - _timeline: TimeLineID, - startpos: XLogRecPtr, - endpos: XLogRecPtr, -) -> bool { +extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).recovery_download(&mut (*sk), startpos, endpos) - } -} - -#[allow(clippy::unnecessary_cast)] -extern "C" fn wal_read( - sk: *mut Safekeeper, - buf: *mut ::std::os::raw::c_char, - startptr: XLogRecPtr, - count: Size, -) { - unsafe { - let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); - let callback_data = (*(*(*sk).wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).wal_read(&mut (*sk), buf, startptr) + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -214,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { } } -extern "C" fn free_event_set(wp: *mut WalProposer) { +#[allow(clippy::unnecessary_cast)] +extern "C" fn wal_read( + sk: *mut Safekeeper, + buf: *mut ::std::os::raw::c_char, + startptr: XLogRecPtr, + count: Size, + _errmsg: *mut *mut ::std::os::raw::c_char, +) -> NeonWALReadResult { unsafe { - let callback_data = (*(*wp).config).callback_data; + let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); + let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).free_event_set(&mut (*wp)); + // TODO: errmsg is not forwarded + (*api).wal_read(&mut (*sk), buf, startptr) + } +} + +extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api 
= callback_data as *mut Box; + (*api).wal_reader_events(&mut (*sk)) } } @@ -238,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).active_state_update_event_set(&mut (*sk)); + } +} + extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -246,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).rm_safekeeper_event_set(&mut (*sk)); + } +} + extern "C" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, @@ -313,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog } } -extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).confirm_wal_streamed(&mut (*wp), lsn) - } -} - extern "C" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, @@ -335,14 +340,6 @@ extern "C" fn log_internal( } } -extern "C" fn after_election(wp: *mut WalProposer) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).after_election(&mut (*wp)) - } -} - #[derive(Debug)] pub enum Level { Debug5, @@ -401,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api { conn_async_write: Some(conn_async_write), conn_blocking_write: Some(conn_blocking_write), recovery_download: Some(recovery_download), - wal_read: Some(wal_read), wal_reader_allocate: Some(wal_reader_allocate), - free_event_set: Some(free_event_set), + wal_read: Some(wal_read), + wal_reader_events: Some(wal_reader_events), init_event_set: Some(init_event_set), update_event_set: Some(update_event_set), + active_state_update_event_set: Some(active_state_update_event_set), add_safekeeper_event_set: Some(add_safekeeper_event_set), + rm_safekeeper_event_set: Some(rm_safekeeper_event_set), wait_event_set: Some(wait_event_set), strong_random: Some(strong_random), get_redo_start_lsn: Some(get_redo_start_lsn), finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), - confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), - after_election: Some(after_election), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f5723018d7..35c8f6904d 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -6,8 +6,8 @@ use utils::id::TenantTimelineId; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree, - WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, + WalProposerFree, WalProposerStart, }, }; @@ -86,19 +86,19 @@ pub trait ApiImpl { todo!() } - fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool { + fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool { todo!() } - fn wal_read(&self, _sk: &mut Safekeeper, 
_buf: &mut [u8], _startpos: u64) { + fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult { todo!() } - fn wal_reader_allocate(&self, _sk: &mut Safekeeper) { + fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult { todo!() } - fn free_event_set(&self, _wp: &mut WalProposer) { + fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 { todo!() } @@ -110,10 +110,18 @@ pub trait ApiImpl { todo!() } + fn active_state_update_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) { todo!() } + fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult { todo!() } @@ -134,10 +142,6 @@ pub trait ApiImpl { todo!() } - fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) { - todo!() - } - fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) { todo!() } @@ -240,6 +244,7 @@ impl Drop for Wrapper { #[cfg(test)] mod tests { + use core::panic; use std::{ cell::Cell, sync::{atomic::AtomicUsize, mpsc::sync_channel}, @@ -247,7 +252,7 @@ mod tests { use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, walproposer::Wrapper}; + use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; use super::ApiImpl; @@ -355,12 +360,17 @@ mod tests { true } - fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) { - println!("wal_reader_allocate") + fn recovery_download( + &self, + _wp: &mut crate::bindings::WalProposer, + _sk: &mut crate::bindings::Safekeeper, + ) -> bool { + true } - fn free_event_set(&self, _: &mut crate::bindings::WalProposer) { - println!("free_event_set") + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { + println!("wal_reader_allocate"); + crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { @@ -383,6 +393,13 @@ mod tests { self.wait_events.set(WaitEventsData { sk, event_mask }); } + fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { + println!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut crate::bindings::Safekeeper + ); + } + fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9e8172c6a1..980fbab22e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -63,6 +63,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 4bd36185a6..0ed27602cd 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -12,3 +12,11 @@ reqwest.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 3963fd466c..4a3f4dea47 100644 --- 
a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1 +1,2 @@ pub mod mgmt_api; +pub mod page_service; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 77eb1bb8e2..87e4ed8efd 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -5,6 +5,8 @@ use utils::{ id::{TenantId, TimelineId}, }; +pub mod util; + #[derive(Debug)] pub struct Client { mgmt_api_endpoint: String, @@ -64,6 +66,18 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn tenant_details( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn list_timelines( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs new file mode 100644 index 0000000000..048a3bb7cd --- /dev/null +++ b/pageserver/client/src/mgmt_api/util.rs @@ -0,0 +1,49 @@ +//! Helpers to do common higher-level tasks with the [`Client`]. + +use std::sync::Arc; + +use tokio::task::JoinSet; +use utils::id::{TenantId, TenantTimelineId}; + +use super::Client; + +/// Retrieve a list of all of the pageserver's timelines. +/// +/// Fails if there are sharded tenants present on the pageserver. +pub async fn get_pageserver_tenant_timelines_unsharded( + api_client: &Arc, +) -> anyhow::Result> { + let mut timelines: Vec = Vec::new(); + let mut tenants: Vec = Vec::new(); + for ti in api_client.list_tenants().await? { + if !ti.id.is_unsharded() { + anyhow::bail!( + "only unsharded tenants are supported at this time: {}", + ti.id + ); + } + tenants.push(ti.id.tenant_id) + } + let mut js = JoinSet::new(); + for tenant_id in tenants { + js.spawn({ + let mgmt_api_client = Arc::clone(api_client); + async move { + ( + tenant_id, + mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + ) + } + }); + } + while let Some(res) = js.join_next().await { + let (tenant_id, details) = res.unwrap(); + for timeline_id in details.timelines { + timelines.push(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + } + Ok(timelines) +} diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs new file mode 100644 index 0000000000..fc0d2311f7 --- /dev/null +++ b/pageserver/client/src/page_service.rs @@ -0,0 +1,151 @@ +use std::pin::Pin; + +use futures::SinkExt; +use pageserver_api::{ + models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, + PagestreamGetPageResponse, + }, + reltag::RelTag, +}; +use tokio::task::JoinHandle; +use tokio_postgres::CopyOutStream; +use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +pub struct Client { + client: tokio_postgres::Client, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct BasebackupRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub lsn: Option, + pub gzip: bool, +} + +impl Client { + pub async fn new(connstring: String) -> anyhow::Result { + let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + + let conn_task_cancel = CancellationToken::new(); + let conn_task = tokio::spawn({ + let conn_task_cancel = conn_task_cancel.clone(); + async move { + tokio::select! 
{ + _ = conn_task_cancel.cancelled() => { } + res = connection => { + res.unwrap(); + } + } + } + }); + Ok(Self { + cancel_on_client_drop: Some(conn_task_cancel.drop_guard()), + conn_task, + client, + }) + } + + pub async fn pagestream( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + let copy_both: tokio_postgres::CopyBothDuplex = self + .client + .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .await?; + let Client { + cancel_on_client_drop, + conn_task, + client: _, + } = self; + Ok(PagestreamClient { + copy_both: Box::pin(copy_both), + conn_task, + cancel_on_client_drop, + }) + } + + pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result { + let BasebackupRequest { + tenant_id, + timeline_id, + lsn, + gzip, + } = req; + let mut args = Vec::with_capacity(5); + args.push("basebackup".to_string()); + args.push(format!("{tenant_id}")); + args.push(format!("{timeline_id}")); + if let Some(lsn) = lsn { + args.push(format!("{lsn}")); + } + if *gzip { + args.push("--gzip".to_string()) + } + Ok(self.client.copy_out(&args.join(" ")).await?) + } +} + +/// Create using [`Client::pagestream`]. +pub struct PagestreamClient { + copy_both: Pin>>, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct RelTagBlockNo { + pub rel_tag: RelTag, + pub block_no: u32, +} + +impl PagestreamClient { + pub async fn shutdown(mut self) { + let _ = self.cancel_on_client_drop.take(); + self.conn_task.await.unwrap(); + } + + pub async fn getpage( + &mut self, + key: RelTagBlockNo, + lsn: Lsn, + ) -> anyhow::Result { + let req = PagestreamGetPageRequest { + latest: false, + rel: key.rel_tag, + blkno: key.block_no, + lsn, + }; + let req = PagestreamFeMessage::GetPage(req); + let req: bytes::Bytes = req.serialize(); + // let mut req = tokio_util::io::ReaderStream::new(&req); + let mut req = tokio_stream::once(Ok(req)); + + self.copy_both.send_all(&mut req).await?; + + let next: Option> = self.copy_both.next().await; + let next: bytes::Bytes = next.unwrap()?; + + let msg = PagestreamBeMessage::deserialize(next)?; + match msg { + PagestreamBeMessage::GetPage(p) => Ok(p), + PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), + PagestreamBeMessage::Exists(_) + | PagestreamBeMessage::Nblocks(_) + | PagestreamBeMessage::DbSize(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + msg.kind() + ) + } + } + } +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml new file mode 100644 index 0000000000..169d9b7f8e --- /dev/null +++ b/pageserver/pagebench/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pagebench" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures.workspace = true +hdrhistogram.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +rand.workspace = true +serde.workspace = true +serde_json.workspace = true +tracing.workspace = true +tokio.workspace = true + +pageserver = { path = ".." 
} +pageserver_client.workspace = true +pageserver_api.workspace = true +utils = { path = "../../libs/utils/" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs new file mode 100644 index 0000000000..85a3e695de --- /dev/null +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -0,0 +1,272 @@ +use anyhow::Context; +use pageserver_client::page_service::BasebackupRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{debug, info, instrument}; + +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// basebackup@LatestLSN +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long, default_value = "1.0")] + gzip_probability: f64, + #[clap(long)] + runtime: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +struct Target { + timeline: TenantTimelineId, + lsn_range: Option>, +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let timeline = *timeline; + // FIXME: this triggers initial logical size calculation + // https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + async move { + anyhow::Ok(Target { + timeline, + // TODO: support lsn_range != latest LSN + lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + }) + } + }); + } + let mut all_targets: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_targets.push(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + 
num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender = async move { + start_work_barrier.wait().await; + loop { + let (timeline, work) = { + let mut rng = rand::thread_rng(); + let target = all_targets.choose(&mut rng).unwrap(); + let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + ( + target.timeline, + Work { + lsn, + gzip: rng.gen_bool(args.gzip_probability), + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? + sender.send(work).await.ok().unwrap(); + } + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[derive(Copy, Clone)] +struct Work { + lsn: Option, + gzip: bool, +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( + &args.page_service_host_port, + args.pageserver_jwt.as_deref(), + )) + .await + .unwrap(); + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + let copy_out_stream = client + .basebackup(&BasebackupRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + lsn, + gzip, + }) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap(); + + use futures::StreamExt; + let size = Arc::new(AtomicUsize::new(0)); + copy_out_stream + .for_each({ + |r| { + let size = Arc::clone(&size); + async move { + let size = Arc::clone(&size); + size.fetch_add(r.unwrap().len(), Ordering::Relaxed); + } + } + }) + .await; + debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + let elapsed = start.elapsed(); + live_stats.inc(); 
+ STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs new file mode 100644 index 0000000000..16d198ab0e --- /dev/null +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -0,0 +1,335 @@ +use anyhow::Context; +use futures::future::join_all; +use pageserver::pgdatadir_mapping::key_to_rel_block; +use pageserver::repository; +use pageserver_api::key::is_rel_block_key; +use pageserver_client::page_service::RelTagBlockNo; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; + +use std::collections::HashMap; +use std::future::Future; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long)] + runtime: Option, + #[clap(long)] + per_target_rate_limit: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +#[derive(Clone)] +struct KeyRange { + timeline: TenantTimelineId, + timeline_lsn: Lsn, + start: i128, + end: i128, +} + +impl KeyRange { + fn len(&self) -> i128 { + self.end - self.start + } +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace(timeline.tenant_id, timeline.timeline_id) + .await?; + let lsn = partitioning.at_lsn; + + let ranges = partitioning + .keys + .ranges + .iter() + .filter_map(|r| { + let start = r.start; + let end = r.end; + // filter out non-relblock keys + match (is_rel_block_key(&start), is_rel_block_key(&end)) { + 
(true, true) => Some(KeyRange { + timeline, + timeline_lsn: lsn, + start: start.to_i128(), + end: end.to_i128(), + }), + (true, false) | (false, true) => { + unimplemented!("split up range") + } + (false, false) => None, + } + }) + .collect::>(); + + anyhow::Ok(ranges) + } + }); + } + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_ranges.extend(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender: Pin>> = match args.per_target_rate_limit { + None => Box::pin(async move { + let weights = rand::distributions::weighted::WeightedIndex::new( + all_ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + start_work_barrier.wait().await; + + loop { + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &all_ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + let sender = work_senders.get(&range.timeline).unwrap(); + // TODO: what if this blocks? 
+ sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }), + Some(rps_limit) => Box::pin(async move { + let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); + + let make_timeline_task: &dyn Fn( + TenantTimelineId, + ) + -> Pin>> = &|timeline| { + let sender = work_senders.get(&timeline).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + loop { + ticker.tick().await; + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }) + }; + + let tasks: Vec<_> = work_senders + .keys() + .map(|tl| make_timeline_task(**tl)) + .collect(); + + start_work_barrier.wait().await; + + join_all(tasks).await; + }), + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + + while let Some((key, lsn)) = work.recv().await { + let start = Instant::now(); + client + .getpage(key, lsn) + .await + .with_context(|| format!("getpage for {timeline}")) + .unwrap(); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs new file mode 100644 index 0000000000..d46ae94e8a --- /dev/null +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use humantime::Duration; +use tokio::task::JoinSet; +use utils::id::TenantTimelineId; + +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + 
#[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap( + long, + help = "if specified, poll mgmt api to check whether init logical size calculation has completed" + )] + poll_for_completion: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + // kick it off + + let mut js = JoinSet::new(); + for tl in timelines { + let mgmt_api_client = Arc::clone(&mgmt_api_client); + js.spawn(async move { + // TODO: API to explicitly trigger initial logical size computation. + // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation. + // => https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + + if let Some(period) = args.poll_for_completion { + let mut ticker = tokio::time::interval(period.into()); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let mut info = info; + while !info.current_logical_size_is_accurate { + ticker.tick().await; + info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + } + } + }); + } + while let Some(res) = js.join_next().await { + let _: () = res.unwrap(); + } + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs new file mode 100644 index 0000000000..e0120c9212 --- /dev/null +++ b/pageserver/pagebench/src/main.rs @@ -0,0 +1,48 @@ +use clap::Parser; +use utils::logging; + +/// Re-usable pieces of code that aren't CLI-specific. +mod util { + pub(crate) mod connstring; + pub(crate) mod request_stats; + #[macro_use] + pub(crate) mod tokio_thread_local_stats; + /// Re-usable pieces of CLI-specific code. + pub(crate) mod cli { + pub(crate) mod targets; + } +} + +/// The pagebench CLI sub-commands, dispatched in [`main`] below. +mod cmd { + pub(super) mod basebackup; + pub(super) mod getpage_latest_lsn; + pub(super) mod trigger_initial_size_calculation; +} + +/// Component-level performance test for pageserver. 
+#[derive(clap::Parser)] +enum Args { + Basebackup(cmd::basebackup::Args), + GetPageLatestLsn(cmd::getpage_latest_lsn::Args), + TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), +} + +fn main() { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stderr, + ) + .unwrap(); + + let args = Args::parse(); + match args { + Args::Basebackup(args) => cmd::basebackup::main(args), + Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Args::TriggerInitialSizeCalculation(args) => { + cmd::trigger_initial_size_calculation::main(args) + } + } + .unwrap() +} diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs new file mode 100644 index 0000000000..848eae27cf --- /dev/null +++ b/pageserver/pagebench/src/util/cli/targets.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use pageserver_client::mgmt_api; +use tracing::info; +use utils::id::TenantTimelineId; + +pub(crate) struct Spec { + pub(crate) limit_to_first_n_targets: Option, + pub(crate) targets: Option>, +} + +pub(crate) async fn discover( + api_client: &Arc, + spec: Spec, +) -> anyhow::Result> { + let mut timelines = if let Some(targets) = spec.targets { + targets + } else { + mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await? + }; + + if let Some(limit) = spec.limit_to_first_n_targets { + timelines.sort(); // for determinism + timelines.truncate(limit); + if timelines.len() < limit { + anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants"); + } + } + + info!("timelines:\n{:?}", timelines); + info!("number of timelines:\n{:?}", timelines.len()); + + Ok(timelines) +} diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs new file mode 100644 index 0000000000..07a0ff042d --- /dev/null +++ b/pageserver/pagebench/src/util/connstring.rs @@ -0,0 +1,8 @@ +pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { + let colon_and_jwt = if let Some(jwt) = jwt { + format!(":{jwt}") // TODO: urlescape + } else { + String::new() + }; + format!("postgres://postgres{colon_and_jwt}@{host_port}") +} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs new file mode 100644 index 0000000000..5ecf1cbf24 --- /dev/null +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -0,0 +1,88 @@ +use std::time::Duration; + +use anyhow::Context; + +pub(crate) struct Stats { + latency_histo: hdrhistogram::Histogram, +} + +impl Stats { + pub(crate) fn new() -> Self { + Self { + // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram, + // which would skew the benchmark results. 
+ latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(), + } + } + pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> { + let micros: u64 = latency + .as_micros() + .try_into() + .context("latency greater than u64")?; + self.latency_histo + .record(micros) + .context("add to histogram")?; + Ok(()) + } + pub(crate) fn output(&self) -> Output { + let latency_percentiles = std::array::from_fn(|idx| { + let micros = self + .latency_histo + .value_at_percentile(LATENCY_PERCENTILES[idx]); + Duration::from_micros(micros) + }); + Output { + request_count: self.latency_histo.len(), + latency_mean: Duration::from_micros(self.latency_histo.mean() as u64), + latency_percentiles: LatencyPercentiles { + latency_percentiles, + }, + } + } + pub(crate) fn add(&mut self, other: &Self) { + let Self { + ref mut latency_histo, + } = self; + latency_histo.add(&other.latency_histo).unwrap(); + } +} + +impl Default for Stats { + fn default() -> Self { + Self::new() + } +} + +const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99]; + +struct LatencyPercentiles { + latency_percentiles: [Duration; 4], +} + +impl serde::Serialize for LatencyPercentiles { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; + for (i, p) in LATENCY_PERCENTILES.iter().enumerate() { + ser.serialize_entry( + &format!("p{p}"), + &format!( + "{}", + &humantime::format_duration(self.latency_percentiles[i]) + ), + )?; + } + ser.end() + } +} + +#[derive(serde::Serialize)] +pub(crate) struct Output { + request_count: u64, + #[serde(with = "humantime_serde")] + latency_mean: Duration, + latency_percentiles: LatencyPercentiles, +} diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs new file mode 100644 index 0000000000..82526213b6 --- /dev/null +++ b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs @@ -0,0 +1,45 @@ +pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>; +pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>; + +macro_rules! declare { + ($THREAD_LOCAL_NAME:ident: $T:ty) => { + thread_local! { + pub static $THREAD_LOCAL_NAME: std::cell::RefCell<ThreadLocalStats<$T>> = std::cell::RefCell::new( + std::sync::Arc::new(std::sync::Mutex::new(Default::default())) + ); + } + }; +} + +use std::sync::{Arc, Mutex}; + +pub(crate) use declare; + +macro_rules!
main { + ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{ + let main_impl = $main_impl; + let all = Arc::new(Mutex::new(Vec::new())); + + let rt = tokio::runtime::Builder::new_multi_thread() + .on_thread_start({ + let all = Arc::clone(&all); + move || { + // pre-initialize the thread local stats by accessesing them + // (some stats like requests_stats::Stats are quite costly to initialize, + // we don't want to pay that cost during the measurement period) + $THREAD_LOCAL_NAME.with(|stats| { + let stats: Arc<_> = Arc::clone(&*stats.borrow()); + all.lock().unwrap().push(stats); + }); + } + }) + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(all)); + rt.block_on(main_task).unwrap() + }}; +} + +pub(crate) use main; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index bd63c4d860..8516f397ca 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1468,6 +1468,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); match &conf.default_tenant_conf.eviction_policy { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 76906cfaf7..23b9b573b6 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` +/// partitioning. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + /// Order the layers to be evicted by how recently they have been accessed in absolute + /// time. + /// + /// This strategy is unfair when some tenants grow faster than others towards the slower + /// growing. + #[default] + AbsoluteAccessed, + + /// Order the layers to be evicted by how recently they have been accessed relatively within + /// the set of resident layers of a tenant. + /// + /// This strategy will evict layers more fairly but is untested. + RelativeAccessed { + #[serde(default)] + highest_layer_count_loses_first: bool, + }, +} + +impl EvictionOrder { + /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer + /// counts should be the first ones to have their layers evicted. 
+ fn highest_layer_count_loses_first(&self) -> bool { + match self { + EvictionOrder::AbsoluteAccessed => false, + EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => *highest_layer_count_loses_first, + } + } } #[derive(Default)] @@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl( + state, + storage, + usage_pre, + task_config.eviction_order, + cancel, + ) + .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) @@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(cancel).await? { + let candidates = match collect_eviction_candidates(eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } @@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { + let nth = i + 1; let desc = candidate.layer.layer_desc(); + let total_candidates = candidates.len(); + let size = desc.file_size; + let rel = candidate.relative_last_activity; debug!( - "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", - i + 1, - candidates.len(), - desc.file_size, + "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - partition, desc.tenant_shard_id, desc.timeline_id, candidate.layer, @@ -459,6 +506,7 @@ struct EvictionCandidate { timeline: Arc, layer: Layer, last_activity_ts: SystemTime, + relative_last_activity: finite_f32::FiniteF32, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -478,24 +526,24 @@ enum EvictionCandidates { /// order. A caller that evicts in that order, until pressure is relieved, implements /// the eviction policy outlined in the module comment. /// -/// # Example +/// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. /// The eviction order would be /// /// ```text -/// partition last_activity_ts tenant/layer -/// Above 18:30 A/c -/// Above 19:00 A/b -/// Above 18:29 B/c -/// Above 19:05 B/b -/// Above 20:00 B/a -/// Above 20:03 A/a -/// Below 20:30 A/d -/// Below 20:40 B/d -/// Below 20:45 B/e -/// Below 20:58 A/e +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. 
@@ -505,7 +553,77 @@ enum EvictionCandidates { /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhauting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. +/// +/// # Example with EvictionOrder::RelativeAccessed +/// +/// ```text +/// partition relative_age last_activity_ts tenant/layer +/// Above 0/4 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/4 19:00 A/b +/// Above 1/4 19:05 B/b +/// Above 2/4 20:00 B/a +/// Above 2/4 20:03 A/a +/// Below 3/4 20:30 A/d +/// Below 3/4 20:40 B/d +/// Below 4/4 20:45 B/e +/// Below 4/4 20:58 A/e +/// ``` +/// +/// With tenants having the same number of layers the picture does not change much. The same with +/// A having many more layers **resident** (not all of them listed): +/// +/// ```text +/// Above 0/100 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/100 19:00 A/b +/// Above 2/100 20:03 A/a +/// Above 3/100 20:03 A/nth_3 +/// Above 4/100 20:03 A/nth_4 +/// ... +/// Above 1/4 19:05 B/b +/// Above 25/100 20:04 A/nth_25 +/// ... +/// Above 2/4 20:00 B/a +/// Above 50/100 20:10 A/nth_50 +/// ... +/// Below 3/4 20:40 B/d +/// Below 99/100 20:30 A/nth_99 +/// Below 4/4 20:45 B/e +/// Below 100/100 20:58 A/nth_100 +/// ``` +/// +/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is +/// difficult to see is what happens on the next round assuming the evicting 23 from the above list +/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has +/// appeared: +/// +/// ```text +/// Above 0/87 20:04 A/nth_23 +/// Above 0/3 19:05 B/b +/// Above 0/50 20:59 C/nth_0 +/// Above 1/87 20:04 A/nth_24 +/// Above 1/50 21:00 C/nth_1 +/// Above 2/87 20:04 A/nth_25 +/// ... +/// Above 16/50 21:02 C/nth_16 +/// Above 1/3 20:00 B/a +/// Above 27/87 20:10 A/nth_50 +/// ... +/// Below 2/3 20:40 B/d +/// Below 49/50 21:05 C/nth_49 +/// Below 86/87 20:30 A/nth_99 +/// Below 3/3 20:45 B/e +/// Below 50/50 21:05 C/nth_50 +/// Below 87/87 20:58 A/nth_100 +/// ``` +/// +/// Now relieving pressure with 23 layers would cost: +/// - tenant A 14 layers +/// - tenant B 1 layer +/// - tenant C 8 layers async fn collect_eviction_candidates( + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { // get a snapshot of the list of tenants @@ -591,12 +709,63 @@ async fn collect_eviction_candidates( tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - for (timeline, layer_info) in tenant_candidates.into_iter() { + + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if eviction_order.highest_layer_count_loses_first() { + // relative_age vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. 
alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = tenant_candidates + .len() + .checked_sub(fudge) + .filter(|&x| x > 0) + // support 0 or 1 resident layer tenants as well + .unwrap_or(1); + let divider = total as f32; + + for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { let file_size = layer_info.file_size(); + + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + let relative_last_activity = if matches!( + eviction_order, + EvictionOrder::RelativeAccessed { .. } + ) { + // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or + // similarly for u16. unsure how it would help. + finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } else { + finite_f32::FiniteF32::ZERO + }; + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, + relative_last_activity, }; let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -610,8 +779,19 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + match eviction_order { + EvictionOrder::AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + EvictionOrder::RelativeAccessed { .. } => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }); + } + } Ok(EvictionCandidates::Finished(candidates)) } @@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey { } } +/// A totally ordered f32 subset we can use with sorting functions. +mod finite_f32 { + + /// A totally ordered f32 subset we can use with sorting functions. 
+ #[derive(Clone, Copy, PartialEq)] + pub struct FiniteF32(f32); + + impl std::fmt::Debug for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0, f) + } + } + + impl std::fmt::Display for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } + } + + impl std::cmp::Eq for FiniteF32 {} + + impl std::cmp::PartialOrd for FiniteF32 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl std::cmp::Ord for FiniteF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.total_cmp(&other.0) + } + } + + impl TryFrom for FiniteF32 { + type Error = f32; + + fn try_from(value: f32) -> Result { + if value.is_finite() { + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } + + impl FiniteF32 { + pub const ZERO: FiniteF32 = FiniteF32(0.0); + + pub fn try_from_normalized(value: f32) -> Result { + if (0.0..=1.0).contains(&value) { + // -0.0 is within the range, make sure it is assumed 0.0..=1.0 + let value = value.abs(); + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } +} + mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; @@ -721,6 +961,7 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { + use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -732,6 +973,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b79c5ada9a..1fbca1086f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -159,6 +159,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" + "412": + description: Deletion may not proceed, tenant is not in Active state + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 601fad5bde..11a3a2c872 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::TenantDetails; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -307,6 +308,7 @@ impl From for ApiError { SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + Cancelled => ApiError::ShuttingDown, } } } @@ -592,8 +594,6 @@ async fn get_lsn_by_timestamp_handler( ))); } - let version: Option = parse_query_param(&request, "version")?; - let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) @@ -606,31 +606,18 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - - if version.unwrap_or(0) > 1 { - #[derive(serde::Serialize)] - struct Result { - lsn: Lsn, - kind: &'static 
str, - } - let (lsn, kind) = match result { - LsnForTimestamp::Present(lsn) => (lsn, "present"), - LsnForTimestamp::Future(lsn) => (lsn, "future"), - LsnForTimestamp::Past(lsn) => (lsn, "past"), - LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), - }; - json_response(StatusCode::OK, Result { lsn, kind }) - } else { - // FIXME: this is a temporary crutch not to break backwards compatibility - // See https://github.com/neondatabase/neon/pull/5608 - let result = match result { - LsnForTimestamp::Present(lsn) => format!("{lsn}"), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - json_response(StatusCode::OK, result) + #[derive(serde::Serialize)] + struct Result { + lsn: Lsn, + kind: &'static str, } + let (lsn, kind) = match result { + LsnForTimestamp::Present(lsn) => (lsn, "present"), + LsnForTimestamp::Future(lsn) => (lsn, "future"), + LsnForTimestamp::Past(lsn) => (lsn, "past"), + LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), + }; + json_response(StatusCode::OK, Result { lsn, kind }) } async fn get_timestamp_of_lsn_handler( @@ -872,11 +859,14 @@ async fn tenant_status( } let state = tenant.current_state(); - Result::<_, ApiError>::Ok(TenantInfo { - id: tenant_shard_id, - state: state.clone(), - current_physical_size: Some(current_physical_size), - attachment_status: state.attachment_status(), + Result::<_, ApiError>::Ok(TenantDetails { + tenant_info: TenantInfo { + id: tenant_shard_id, + state: state.clone(), + current_physical_size: Some(current_physical_size), + attachment_status: state.attachment_status(), + }, + timelines: tenant.list_timeline_ids(), }) } .instrument(info_span!("tenant_status_handler", @@ -897,7 +887,9 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id) + state + .tenant_manager + .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug() @@ -1577,19 +1569,22 @@ async fn disk_usage_eviction_run( struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, + + #[serde(default)] + eviction_order: crate::disk_usage_eviction_task::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct - config: Config, + evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { - self.config.evict_bytes > self.freed_bytes + self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { @@ -1600,7 +1595,7 @@ async fn disk_usage_eviction_run( let config = json_request::(&mut r).await?; let usage = Usage { - config, + evict_bytes: config.evict_bytes, freed_bytes: 0, }; @@ -1615,7 +1610,11 @@ async fn disk_usage_eviction_run( let state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, storage, usage, &cancel, + &state, + storage, + usage, + config.eviction_order, + &cancel, ) .await; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b81037ae47..e9884a15f5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1776,6 +1776,7 @@ pub fn is_inherited_key(key: Key) -> bool { key != AUX_FILES_KEY } +/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( @@ -1790,7 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } - pub fn is_rel_fsm_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eceef6bf78..2f2169d194 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1552,6 +1552,10 @@ impl Tenant { .collect() } + pub fn list_timeline_ids(&self) -> Vec { + self.timelines.lock().unwrap().keys().cloned().collect() + } + /// This is used to create the initial 'main' timeline during bootstrapping, /// or when importing a new base backup. The caller is expected to load an /// initial image of the datadir to the new timeline after this. 
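A side note on the `eviction_order` knob introduced above: `EvictionOrder` is adjacently tagged (`#[serde(tag = "type", content = "args")]`) and defaults to `AbsoluteAccessed`, so both the disk-usage eviction section of the pageserver config and the `disk_usage_eviction_run` debug endpoint's `Config` accept it in the shape sketched below. This is a minimal, self-contained mirror of the enum from this diff, assuming only the `serde` and `serde_json` crates; the `main` function and the printed JSON are illustrative and not part of the patch.

use serde::{Deserialize, Serialize};

// Mirrors the variants and serde attributes of the EvictionOrder enum added in
// pageserver/src/disk_usage_eviction_task.rs; everything else here is a sketch.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    #[default]
    AbsoluteAccessed,
    RelativeAccessed {
        #[serde(default)]
        highest_layer_count_loses_first: bool,
    },
}

fn main() {
    // Prints: {"type":"AbsoluteAccessed"}
    println!(
        "{}",
        serde_json::to_string(&EvictionOrder::AbsoluteAccessed).unwrap()
    );
    // Prints: {"type":"RelativeAccessed","args":{"highest_layer_count_loses_first":true}}
    println!(
        "{}",
        serde_json::to_string(&EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first: true
        })
        .unwrap()
    );
}

For the debug endpoint this is the JSON body shape next to `evict_bytes`; the TOML pageserver config would use the equivalent nested-table form, and omitting the field falls back to `AbsoluteAccessed` via `#[serde(default)]`.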
@@ -3130,6 +3134,7 @@ impl Tenant { /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] + #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index e8491f26db..b21bad51ba 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError { #[error("Timeline {0}")] Timeline(#[from] DeleteTimelineError), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b2f14db9f7..62922e8c99 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -514,10 +514,7 @@ pub async fn init_tenant_mgr( &ctx, ) { Ok(tenant) => { - tenants.insert( - TenantShardId::unsharded(tenant.tenant_id()), - TenantSlot::Attached(tenant), - ); + tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); @@ -962,35 +959,27 @@ impl TenantManager { } let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); + + // Directory structure is the same for attached and secondary modes: + // create it if it doesn't exist. Timeline load/creation expects the + // timelines/ subdir to already exist. + // + // Does not need to be fsync'd because local storage is just a cache. + tokio::fs::create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + // Before activating either secondary or attached mode, persist the + // configuration, so that on restart we will re-attach (or re-start + // secondary) on the tenant. + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. 
- tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } + LocationMode::Secondary(_) => TenantSlot::Secondary, LocationMode::Attached(_attach_config) => { - let timelines_path = self.conf.timelines_path(&tenant_shard_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - let shard_identity = new_location_config.shard; let tenant = tenant_spawn( self.conf, @@ -1102,6 +1091,71 @@ impl TenantManager { .collect(), } } + + pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + activation_timeout: Duration, + ) -> Result<(), DeleteTenantError> { + // We acquire a SlotGuard during this function to protect against concurrent + // changes while the ::prepare phase of DeleteTenantFlow executes, but then + // have to return the Tenant to the map while the background deletion runs. + // + // TODO: refactor deletion to happen outside the lifetime of a Tenant. + // Currently, deletion requires a reference to the tenants map in order to + // keep the Tenant in the map until deletion is complete, and then remove + // it at the end. + // + // See https://github.com/neondatabase/neon/issues/5080 + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // unwrap is safe because we used MustExist mode when acquiring + let tenant = match slot_guard.get_old_value().as_ref().unwrap() { + TenantSlot::Attached(tenant) => tenant.clone(), + _ => { + // Express "not attached" as equivalent to "not found" + return Err(DeleteTenantError::NotAttached); + } + }; + + match tenant.current_state() { + TenantState::Broken { .. } | TenantState::Stopping { .. } => { + // If a tenant is broken or stopping, DeleteTenantFlow can + // handle it: broken tenants proceed to delete, stopping tenants + // are checked for deletion already in progress. 
+ } + _ => { + tenant + .wait_to_become_active(activation_timeout) + .await + .map_err(|e| match e { + GetActiveTenantError::WillNotBecomeActive(_) => { + DeleteTenantError::InvalidState(tenant.current_state()) + } + GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, + GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, + GetActiveTenantError::WaitForActiveTimeout { + latest_state: _latest_state, + wait_time: _wait_time, + } => DeleteTenantError::InvalidState(tenant.current_state()), + })?; + } + } + + let result = DeleteTenantFlow::run( + self.conf, + self.resources.remote_storage.clone(), + &TENANTS, + tenant, + ) + .await; + + // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow + slot_guard.revert(); + result + } } #[derive(Debug, thiserror::Error)] @@ -1279,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout( Ok(tenant) } -pub(crate) async fn delete_tenant( - conf: &'static PageServerConf, - remote_storage: Option, - tenant_shard_id: TenantShardId, -) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - - // TODO(sharding): make delete API sharding-aware - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 52ee8f49ce..1b0cf39fbe 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2192,15 +2192,6 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); - let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let remote_timeline_dir = test_state.harness.remote_fs_dir.join( - timeline_path - .strip_prefix(&test_state.harness.conf.workdir) - .unwrap(), - ); - - std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( &test_state.harness.tenant_shard_id, @@ -2209,6 +2200,10 @@ mod tests { ) .get_path(), ); + + std::fs::create_dir_all(index_path.parent().unwrap()) + .expect("creating test dir should work"); + eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); example_index_part diff --git a/pageserver/src/tenant/storage_layer/layer.rs 
b/pageserver/src/tenant/storage_layer/layer.rs index 9a8ddc1a6b..8ae911b31e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -878,6 +878,23 @@ impl LayerInner { Ok(()) } Err(e) => { + let consecutive_failures = + this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, + _ = timeline.cancel.cancelled() => {}, + }; + Err(e) } }; @@ -926,21 +943,9 @@ impl LayerInner { Ok(permit) } Ok((Err(e), _permit)) => { - // FIXME: this should be with the spawned task and be cancellation sensitive - // - // while we should not need this, this backoff has turned out to be useful with - // a bug of unexpectedly deleted remote layer file (#5787). - let consecutive_failures = - self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + // sleep already happened in the spawned task, if it was not cancelled + let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::time::sleep(backoff).await; Err(DownloadError::DownloadFailed) } Err(_gone) => Err(DownloadError::DownloadCancelled), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7bfa246eeb..5a5b3d7586 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" info!("broker disconnected: {status}"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 16b245c488..1d14214030 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1612,6 +1612,7 @@ impl<'a> WalIngest<'a> { mod tests { use super::*; use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -2177,21 +2178,25 @@ mod tests { let pg_version = 15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let harness = 
TenantHarness::create("test_ingest_real_wal").unwrap(); + let (tenant, ctx) = harness.load().await; + + let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); + + std::fs::create_dir_all(initdb_path.parent().unwrap()) + .expect("creating test dir should work"); + std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works"); + // Bootstrap a real timeline. We can't use create_test_timeline because // it doesn't create a real checkpoint, and Walingest::new tries to parse // the garbage data. - // - // TODO use the initdb.tar.zst file stored with the test data to avoid - // problems with inconsistent initdb results after pg minor version bumps. - let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") - .unwrap() - .load() - .await; let tline = tenant - .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx) .await .unwrap(); diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 466e346e46..c6b224a14d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,7 @@ OBJS = \ libpagestore.o \ neon.o \ neon_utils.o \ + neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 2e7da671f9..e467a9c43a 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -19,20 +19,21 @@ *------------------------------------------------------------------------- */ #include "postgres.h" + +#include + +#include "access/xact.h" +#include "commands/defrem.h" +#include "fmgr.h" +#include "libpq/crypt.h" +#include "miscadmin.h" #include "tcop/pquery.h" #include "tcop/utility.h" -#include "access/xact.h" +#include "utils/acl.h" +#include "utils/guc.h" #include "utils/hsearch.h" #include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include #include "utils/jsonb.h" -#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index fbbb8fd448..d9a75142f1 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -1,4 +1,3 @@ - /*------------------------------------------------------------------------- * * extension_server.c @@ -10,21 +9,11 @@ *------------------------------------------------------------------------- */ #include "postgres.h" -#include "tcop/pquery.h" -#include "tcop/utility.h" -#include "access/xact.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include "fmgr.h" #include +#include "utils/guc.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 54b3661e66..6725ce8fff 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -13,32 +13,30 @@ *------------------------------------------------------------------------- */ +#include "postgres.h" + #include #include #include -#include "postgres.h" - #include "neon_pgversioncompat.h" +#include "access/parallel.h" #include "funcapi.h" #include 
"miscadmin.h" -#include "pgstat.h" #include "pagestore_client.h" -#include "access/parallel.h" +#include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" -#include "storage/latch.h" +#include "storage/fd.h" #include "storage/ipc.h" +#include "storage/latch.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" -#include "storage/fd.h" -#include "storage/pg_shmem.h" -#include "storage/buf_internals.h" -#include "pgstat.h" /* * Local file cache is used to temporary store relations pages in local file system. @@ -102,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) -void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg); - /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 16406ce8a3..3b038f906f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,28 +14,24 @@ */ #include "postgres.h" -#include "pagestore_client.h" -#include "fmgr.h" #include "access/xlog.h" -#include "access/xlogutils.h" -#include "storage/buf_internals.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/pg_shmem.h" -#include "c.h" -#include "postmaster/interrupt.h" - +#include "fmgr.h" #include "libpq-fe.h" -#include "libpq/pqformat.h" #include "libpq/libpq.h" - +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/guc.h" #include "neon.h" -#include "walproposer.h" #include "neon_utils.h" +#include "pagestore_client.h" +#include "walproposer.h" #define PageStoreTrace DEBUG5 @@ -62,8 +58,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int n_reconnect_attempts = 0; -int max_reconnect_attempts = 60; +static int n_reconnect_attempts = 0; +static int max_reconnect_attempts = 60; #define MAX_PAGESERVER_CONNSTRING_SIZE 256 @@ -83,8 +79,6 @@ static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - static bool pageserver_flush(void); static void pageserver_disconnect(void); @@ -627,8 +621,6 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; - old_redo_read_buffer_filter = redo_read_buffer_filter; - redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h new file mode 100644 index 0000000000..cd7e568a47 --- /dev/null +++ b/pgxn/neon/libpqwalproposer.h @@ -0,0 +1,96 @@ +/* + * Interface to set of libpq wrappers walproposer and neon_walreader need. + * Similar to libpqwalreceiver, but it has blocking connection establishment and + * pqexec which don't fit us. Implementation is at walproposer_pg.c. 
+ */ +#ifndef ___LIBPQWALPROPOSER_H__ +#define ___LIBPQWALPROPOSER_H__ + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Possible return values from walprop_async_read */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from walprop_async_write */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * This header is included by walproposer.h to define walproposer_api; if we're + * building walproposer without pg, ignore libpq part, leaving only interface + * types. + */ +#ifndef WALPROPOSER_LIB + +#include "libpq-fe.h" + +/* + * Sometimes working directly with underlying PGconn is simpler, export the + * whole thing for simplicity. + */ +typedef struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received CopyData message from + * walprop_async_read */ +} WalProposerConn; + +extern WalProposerConn *libpqwp_connect_start(char *conninfo); +extern bool libpqwp_send_query(WalProposerConn *conn, char *query); +extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); +extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); +extern void libpqwp_disconnect(WalProposerConn *conn); + +#endif /* WALPROPOSER_LIB */ +#endif /* ___LIBPQWALPROPOSER_H__ */ diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 897a8373a1..c3afecc679 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -27,13 +27,6 @@ extern void pg_init_walproposer(void); extern void pg_init_extension_server(void); -/* - * Returns true if we shouldn't do REDO on that block in record indicated by - * block_id; false otherwise. 
- */ -extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); -extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 807d2decf6..9135847aaf 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -3,33 +3,8 @@ #include "postgres.h" -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" +#include "lib/stringinfo.h" #include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "libpq-fe.h" -#include -#include - -#if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#endif -#if PG_MAJORVERSION_NUM >= 16 -#include "utils/guc.h" -#endif /* * Convert a character which represents a hexadecimal digit to an integer. diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 20745d8b26..a86f1e061c 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,8 +1,6 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ -#include "postgres.h" - bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c new file mode 100644 index 0000000000..f7ec9e5bfa --- /dev/null +++ b/pgxn/neon/neon_walreader.c @@ -0,0 +1,742 @@ +/* + * Like WALRead, but when WAL segment doesn't exist locally instead of throwing + * ERROR asynchronously tries to fetch it from the most advanced safekeeper. + * + * We can't use libpqwalreceiver as it blocks during connection establishment + * (and waiting for PQExec result), so use libpqwalproposer instead. + * + * TODO: keepalives are currently never sent, so the other side can close the + * connection prematurely. + * + * TODO: close conn if reading takes too long to prevent stuck connections. + */ +#include "postgres.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "libpq/pqformat.h" +#include "storage/fd.h" +#include "utils/wait_event.h" + +#include "libpq-fe.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +#define NEON_WALREADER_ERR_MSG_LEN 512 + +/* + * Can be called where NeonWALReader *state is available in the context, adds log_prefix. + */ +#define nwr_log(elevel, fmt, ...) 
elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__) + +static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); +static void NeonWALReaderResetRemote(NeonWALReader *state); +static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +static void neon_wal_segment_close(NeonWALReader *state); +static bool is_wal_segment_exists(XLogSegNo segno, int segsize, + TimeLineID tli); + +/* + * State of connection to donor safekeeper. + */ +typedef enum +{ + /* no remote connection */ + RS_NONE, + /* doing PQconnectPoll, need readable socket */ + RS_CONNECTING_READ, + /* doing PQconnectPoll, need writable socket */ + RS_CONNECTING_WRITE, + /* Waiting for START_REPLICATION result */ + RS_WAIT_EXEC_RESULT, + /* replication stream established */ + RS_ESTABLISHED, +} NeonWALReaderRemoteState; + +struct NeonWALReader +{ + /* + * LSN before which we assume WAL is not available locally. Exists because + * though first segment after startup always exists, part before + * basebackup LSN is filled with zeros. + */ + XLogRecPtr available_lsn; + WALSegmentContext segcxt; + WALOpenSegment seg; + int wre_errno; + /* Explains failure to read, static for simplicity. */ + char err_msg[NEON_WALREADER_ERR_MSG_LEN]; + + /* + * Saved info about request in progress, used to check validity of + * arguments after resume and remember how far we accomplished it. req_lsn + * is 0 if there is no request in progress. + */ + XLogRecPtr req_lsn; + Size req_len; + Size req_progress; + WalProposer *wp; /* we learn donor through walproposer */ + char donor_name[64]; /* saved donor safekeeper name for logging */ + /* state of connection to safekeeper */ + NeonWALReaderRemoteState rem_state; + WalProposerConn *wp_conn; + + /* + * position in wp_conn recvbuf from which we'll copy WAL next time, or + * NULL if there is no unprocessed message + */ + char *wal_ptr; + Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ + + /* + * LSN of wal_ptr position according to walsender to cross check against + * read request + */ + XLogRecPtr rem_lsn; + + /* prepended to lines logged by neon_walreader, if provided */ + char log_prefix[64]; +}; + +/* palloc and initialize NeonWALReader */ +NeonWALReader * +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +{ + NeonWALReader *reader; + + reader = (NeonWALReader *) + palloc_extended(sizeof(NeonWALReader), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!reader) + return NULL; + + reader->available_lsn = available_lsn; + reader->seg.ws_file = -1; + reader->seg.ws_segno = 0; + reader->seg.ws_tli = 0; + reader->segcxt.ws_segsize = wal_segment_size; + + reader->wp = wp; + + reader->rem_state = RS_NONE; + + if (log_prefix) + strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); + + return reader; +} + +void +NeonWALReaderFree(NeonWALReader *state) +{ + if (state->seg.ws_file != -1) + neon_wal_segment_close(state); + if (state->wp_conn) + libpqwp_disconnect(state->wp_conn); + pfree(state); +} + +/* + * Like vanilla WALRead, but if requested position is before available_lsn or + * WAL segment doesn't exist on disk, it tries to fetch needed segment from the + * advanced safekeeper. 
+ * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error + * occurs, in which case NeonWALReaderErrMsg() has the description. An error always closes the remote + * connection, if there was any, so the socket subscription should be removed. + * + * NEON_WALREAD_WOULDBLOCK means the caller should obtain the socket to wait for with + * NeonWALReaderSocket and call NeonWALRead again with exactly the same + * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq + * docs, during connection establishment (before the first successful read) the socket + * underneath might change. + * + * Also, the walreader eventually switches from remote to local reads; the caller + * should then remove the socket subscription by checking NeonWALReaderEvents + * after a successful read (otherwise the next read might reopen the connection with + * a different socket). + * + * Non-monotonic reads are not supported and will result in an error. + * + * The caller should be sure that WAL up to the requested LSN exists, otherwise + * NEON_WALREAD_WOULDBLOCK might be returned forever. + */ +NeonWALReadResult +NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + /* + * If the requested data is before the known available basebackup LSN, or + * remote state is already active, do a remote read. + */ + if (startptr < state->available_lsn || state->rem_state != RS_NONE) + { + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + if (NeonWALReadLocal(state, buf, startptr, count, tli)) + { + return NEON_WALREAD_SUCCESS; + } + else if (state->wre_errno == ENOENT) + { + nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr)); + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + else + { + return NEON_WALREAD_ERROR; + } +} + +/* Do the read from remote safekeeper.
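+ * + * The connection is established lazily: from RS_NONE we pick the donor with + * GetDonor(), start the connection and drive PQconnectPoll() through + * RS_CONNECTING_READ/RS_CONNECTING_WRITE, send START_REPLICATION and wait for + * its result in RS_WAIT_EXEC_RESULT, and only then copy WAL out of the CopyBoth + * stream in RS_ESTABLISHED. Every step that would block returns + * NEON_WALREAD_WOULDBLOCK.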
*/ +static NeonWALReadResult +NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + if (state->rem_state == RS_NONE) + { + XLogRecPtr donor_lsn; + + /* no connection yet; start one */ + Safekeeper *donor = GetDonor(state->wp, &donor_lsn); + + if (donor == NULL) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to establish remote connection to fetch WAL: no donor available"); + return NEON_WALREAD_ERROR; + } + snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); + nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", + state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); + state->wp_conn = libpqwp_connect_start(donor->conninfo); + if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: immediately failed with %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + /* we'll poll immediately */ + state->rem_state = RS_CONNECTING_READ; + } + + if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) + { + switch (PQconnectPoll(state->wp_conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: poll error: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + case PGRES_POLLING_READING: + state->rem_state = RS_CONNECTING_READ; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_WRITING: + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_OK: + { + /* connection successfully established */ + char start_repl_query[128]; + + snprintf(start_repl_query, sizeof(start_repl_query), + "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", + LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", + state->donor_name, start_repl_query); + if (!libpqwp_send_query(state->wp_conn, start_repl_query)) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to send %s query to %s: %s", + start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + state->rem_state = RS_WAIT_EXEC_RESULT; + break; + } + + default: /* there is unused PGRES_POLLING_ACTIVE */ + Assert(false); + return NEON_WALREAD_ERROR; /* keep the compiler quiet */ + } + } + + if (state->rem_state == RS_WAIT_EXEC_RESULT) + { + switch (libpqwp_get_query_result(state->wp_conn)) + { + case WP_EXEC_SUCCESS_COPYBOTH: + state->rem_state = RS_ESTABLISHED; + break; + case WP_EXEC_NEEDS_INPUT: + return NEON_WALREAD_WOULDBLOCK; + case WP_EXEC_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s failed: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + default: /* can't happen */ + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s: unexpected result", + state->donor_name); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + } + + Assert(state->rem_state == RS_ESTABLISHED); + + /* + * If we had the request before, verify args are the same and advance the + * result ptr according 
to the progress; otherwise register the request. + */ + if (state->req_lsn != InvalidXLogRecPtr) + { + if (state->req_lsn != startptr || state->req_len != count) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "args changed during request, was %X/%X %zu, now %X/%X %zu", + LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu", + LSN_FORMAT_ARGS(startptr), + count, + state->req_progress); + buf += state->req_progress; + } + else + { + state->req_lsn = startptr; + state->req_len = count; + state->req_progress = 0; + nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(startptr), + count); + } + + while (true) + { + Size to_copy; + + /* + * If we have no ready data, receive new message. + */ + if (state->wal_rem_len == 0 && + + /* + * check for the sake of 0 length reads; walproposer does these for + * heartbeats, though generally they shouldn't hit remote source. + */ + state->req_len - state->req_progress > 0) + { + NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state); + + if (read_msg_res != NEON_WALREAD_SUCCESS) + return read_msg_res; + } + + if (state->req_lsn + state->req_progress != state->rem_lsn) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(state->req_lsn + state->req_progress), + LSN_FORMAT_ARGS(state->rem_lsn), + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + + /* We can copy min of (available, requested) bytes. */ + to_copy = + Min(state->req_len - state->req_progress, state->wal_rem_len); + memcpy(buf, state->wal_ptr, to_copy); + state->wal_ptr += to_copy; + state->wal_rem_len -= to_copy; + state->rem_lsn += to_copy; + if (state->wal_rem_len == 0) + state->wal_ptr = NULL; /* freed by libpqwalproposer */ + buf += to_copy; + state->req_progress += to_copy; + if (state->req_progress == state->req_len) + { + XLogSegNo next_segno; + XLogSegNo req_segno; + + XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize); + XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize); + + /* + * Request completed. If there is a chance of serving next one + * locally, close the connection. + */ + if (state->req_lsn < state->available_lsn && + state->rem_lsn >= state->available_lsn) + { + nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally", + LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno && + is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli)) + { + nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists", + LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + return NEON_WALREAD_SUCCESS; + } + } +} + +/* + * Read one WAL message from the stream, sets state->wal_ptr in case of success. + * Resets remote state in case of failure. 
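+ * + * Two CopyData message types are handled: 'w' (XLogData: start LSN, end LSN and + * send time as three int64s, followed by the WAL bytes) and 'k' (keepalive: end + * LSN, timestamp and a reply-requested flag). Keepalives are otherwise ignored, + * except to detect a donor that doesn't have the WAL we asked for.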
+ */ +static NeonWALReadResult +NeonWALReaderReadMsg(NeonWALReader *state) +{ + while (true) /* loop until we get 'w' */ + { + char *copydata_ptr; + int copydata_size; + StringInfoData s; + char msg_type; + int hdrlen; + + Assert(state->rem_state == RS_ESTABLISHED); + Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); + + switch (libpqwp_async_read(state->wp_conn, + ©data_ptr, + ©data_size)) + { + case PG_ASYNC_READ_SUCCESS: + break; + case PG_ASYNC_READ_TRY_AGAIN: + return NEON_WALREAD_WOULDBLOCK; + case PG_ASYNC_READ_FAIL: + snprintf(state->err_msg, + sizeof(state->err_msg), + "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len, + state->req_progress, + PQerrorMessage(state->wp_conn->pg_conn)); + goto err; + } + + /* put data on StringInfo to parse */ + s.data = copydata_ptr; + s.len = copydata_size; + s.cursor = 0; + s.maxlen = -1; + + if (copydata_size == 0) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "zero length copydata received"); + goto err; + } + msg_type = pq_getmsgbyte(&s); + switch (msg_type) + { + case 'w': + { + XLogRecPtr start_lsn; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "invalid WAL message received from primary"); + goto err; + } + + start_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ + pq_getmsgint64(&s); /* TimestampTz send_time */ + + state->rem_lsn = start_lsn; + state->wal_rem_len = (Size) (s.len - s.cursor); + state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); + nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", + LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); + + return NEON_WALREAD_SUCCESS; + } + case 'k': + { + XLogRecPtr end_lsn; + bool reply_requested; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "invalid keepalive message received from primary"); + goto err; + } + + end_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* TimestampTz timestamp; */ + reply_requested = pq_getmsgbyte(&s); + nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", + LSN_FORMAT_ARGS(end_lsn), + reply_requested); + if (end_lsn < state->req_lsn + state->req_len) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", + LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); + goto err; + } + continue; + } + default: + nwr_log(WARNING, "invalid replication message type %d", msg_type); + continue; + } + } +err: + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; +} + +/* reset remote connection and request in progress */ +static void +NeonWALReaderResetRemote(NeonWALReader *state) +{ + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + state->rem_state = RS_NONE; + if (state->wp_conn) + { + libpqwp_disconnect(state->wp_conn); + state->wp_conn = NULL; + } + state->donor_name[0] = '\0'; + state->wal_ptr = NULL; + state->wal_rem_len = 0; + state->rem_lsn = InvalidXLogRecPtr; +} + +/* + * Return socket of connection to remote source. Must be called only when + * connection exists (NeonWALReaderEvents returns non zero). 
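+ * + * After NEON_WALREAD_WOULDBLOCK the caller is expected to wait for + * NeonWALReaderEvents(state) on this socket and then retry NeonWALRead() with + * exactly the same arguments; once NeonWALReaderIsRemConnEstablished() returns + * true the socket stays stable (until an error or a switch back to local reads), + * so its event subscription can be updated instead of re-added.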
+ */ +pgsocket +NeonWALReaderSocket(NeonWALReader *state) +{ + if (!state->wp_conn) + nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); + return PQsocket(state->wp_conn->pg_conn); +} + +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. + */ +bool +NeonWALReaderIsRemConnEstablished(NeonWALReader *state) +{ + return state->rem_state == RS_ESTABLISHED; +} + +/* + * Returns events user should wait on connection socket or 0 if remote + * connection is not active. + */ +extern uint32 +NeonWALReaderEvents(NeonWALReader *state) +{ + switch (state->rem_state) + { + case RS_NONE: + return 0; + case RS_CONNECTING_READ: + return WL_SOCKET_READABLE; + case RS_CONNECTING_WRITE: + return WL_SOCKET_WRITEABLE; + case RS_WAIT_EXEC_RESULT: + case RS_ESTABLISHED: + return WL_SOCKET_READABLE; + default: + Assert(false); + return 0; /* make compiler happy */ + } +} + +static bool +NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. + */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + neon_wal_segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + if (!neon_wal_segment_open(state, nextSegNo, &tli)) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = errno; + + XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", + fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); + return false; + } + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? 
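+ * For example, with 16 MB segments a 32 KB read that starts 8 KB before a + * segment boundary is served in two passes: 8 KB from the current segment, then + * 24 KB from the next one (assuming each pg_pread returns the full amount).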
*/ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); + + if (readbytes < 0) + { + state->wre_errno = errno; + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", + fname, startoff, strerror(state->wre_errno)); + } + else + { + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", + fname, startoff); + } + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* + * Copy of vanilla wal_segment_open, but returns false in case of error instead + * of ERROR, with errno set. + * + * XLogReaderRoutine->segment_open callback for local pg_wal files + */ +static bool +neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + nwr_log(DEBUG5, "opening %s", path); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return true; + + return false; +} + +static bool +is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) +{ + struct stat stat_buffer; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, segno, segsize); + return stat(path, &stat_buffer) == 0; +} + +/* copy of vanilla wal_segment_close with NeonWALReader */ +static void +neon_wal_segment_close(NeonWALReader *state) +{ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + /* need to check errno? 
*/ + state->seg.ws_file = -1; + } +} + +char * +NeonWALReaderErrMsg(NeonWALReader *state) +{ + return state->err_msg; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h new file mode 100644 index 0000000000..6be9f149aa --- /dev/null +++ b/pgxn/neon/neon_walreader.h @@ -0,0 +1,30 @@ +#ifndef __NEON_WALREADER_H__ +#define __NEON_WALREADER_H__ + +#include "access/xlogdefs.h" + +/* forward declare so we don't have to expose the struct to the public */ +struct NeonWALReader; +typedef struct NeonWALReader NeonWALReader; + +/* avoid including walproposer.h as it includes us */ +struct WalProposer; +typedef struct WalProposer WalProposer; + +/* NeonWALRead return value */ +typedef enum +{ + NEON_WALREAD_SUCCESS, + NEON_WALREAD_WOULDBLOCK, + NEON_WALREAD_ERROR, +} NeonWALReadResult; + +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern void NeonWALReaderFree(NeonWALReader *state); +extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +extern pgsocket NeonWALReaderSocket(NeonWALReader *state); +extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); +extern char *NeonWALReaderErrMsg(NeonWALReader *state); + +#endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index ecfadb01d6..3fcaab0bee 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -13,19 +13,16 @@ #ifndef pageserver_h #define pageserver_h -#include "postgres.h" #include "neon_pgversioncompat.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR -#include "storage/block.h" -#include "storage/smgr.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" +#include "storage/block.h" +#include "storage/smgr.h" #include "utils/memutils.h" -#include "pg_config.h" - typedef enum { /* pagestore_client -> pagestore */ @@ -158,11 +155,8 @@ extern page_server_api *page_server; extern char *page_server_connstring; extern int flush_every_n_requests; extern int readahead_buffer_size; -extern bool seqscan_prefetch_enabled; -extern int seqscan_prefetch_distance; extern char *neon_timeline; extern char *neon_tenant; -extern bool wal_redo; extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 609d80588c..8888cd89c6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -47,25 +47,26 @@ #include "access/xact.h" #include "access/xlog.h" +#include "access/xlogdefs.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" -#include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" -#include "pagestore_client.h" -#include "postmaster/interrupt.h" +#include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" #include "storage/fsm_internals.h" -#include "storage/smgr.h" #include "storage/md.h" -#include "pgstat.h" +#include "storage/smgr.h" + +#include "pagestore_client.h" #if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif @@ -106,6 +107,9 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase 
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + /* * Prefetch implementation: * @@ -239,7 +243,7 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -PrefetchState *MyPState; +static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ ( \ @@ -257,7 +261,7 @@ PrefetchState *MyPState; ) \ ) -XLogRecPtr prefetch_lsn = 0; +static XLogRecPtr prefetch_lsn = 0; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); @@ -1371,6 +1375,9 @@ neon_init(void) MyPState->prf_hash = prfh_create(MyPState->hashctx, readahead_buffer_size, NULL); + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; + #ifdef DEBUG_COMPARE_LOCAL mdinit(); #endif @@ -2869,7 +2876,7 @@ get_fsm_physical_block(BlockNumber heapblk) * contents, where with REDO locking it would wait on block 1 and see * block 3 with post-REDO contents only. */ -bool +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index fc3332612c..7fb0cab9a0 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -45,7 +45,6 @@ /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); @@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); -static char *FormatSafekeeperState(SafekeeperState state); +static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); -static uint32 SafekeeperStateDesiredEvents(SafekeeperState state); static char *FormatEvents(WalProposer *wp, uint32 events); + WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { @@ -113,6 +112,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; { @@ -127,8 +127,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); - wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]); - wp->safekeeper[wp->n_safekeepers].flushWrite = false; wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; @@ -277,7 +275,7 @@ WalProposerPoll(WalProposer *wp) wp->config->safekeeper_connection_timeout)) { walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk->state), 
wp->config->safekeeper_connection_timeout); + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp) WalProposerPoll(wp); } -/* - * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - WalProposer *wp = to_remove->wp; - - /* Remove the existing event set, assign sk->eventPos = -1 */ - wp->api.free_event_set(wp); - /* Re-initialize it without adding any safekeeper events */ - wp->api.init_event_set(wp); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < wp->n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &wp->safekeeper[i]; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->state != SS_OFFLINE) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - /* will set sk->eventPos */ - wp->api.add_safekeeper_event_set(sk, desired_events); - } - } -} /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { - sk->wp->api.conn_finish(sk); sk->state = SS_OFFLINE; - sk->flushWrite = false; sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; - HackyRemoveWalProposerEvent(sk); + sk->wp->api.conn_finish(sk); + sk->wp->api.rm_safekeeper_event_set(sk); } /* @@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { +#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ WalProposer *wp = sk->wp; +#endif /* * Sanity check. We assume further down that the operations don't block @@ -527,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_VOTING: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -556,7 +518,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk) * Because PQconnectPoll can change the socket, we have to un-register the * old event and re-register an event on the new socket. */ - HackyRemoveWalProposerEvent(sk); + wp->api.rm_safekeeper_event_set(sk); wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ @@ -847,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk) } else if (wp->n_votes > wp->quorum) { - /* recovery already performed, just start streaming */ + /* already elected, start streaming */ SendProposerElected(sk); } else @@ -873,21 +835,16 @@ HandleElectedProposer(WalProposer *wp) DetermineEpochStartLsn(wp); /* - * Check if not all safekeepers are up-to-date, we need to download WAL - * needed to synchronize them + * Synchronously download WAL from the most advanced safekeeper. 
We do + * that only for logical replication (and switching logical walsenders to + * neon_walreader is a todo.) */ - if (wp->truncateLsn < wp->propEpochStartLsn) + if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(wp->truncateLsn), - LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - /* Perform recovery */ - if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) - walprop_log(FATAL, "Failed to recover state"); + walprop_log(FATAL, "failed to download WAL for logical replicaiton"); } - else if (wp->config->syncSafekeepers) + + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); @@ -1085,13 +1042,6 @@ DetermineEpochStartLsn(WalProposer *wp) } walprop_shared->mineLastElectedTerm = wp->propTerm; } - - /* - * WalProposer has just elected itself and initialized history, so we can - * call election callback. Usually it updates truncateLsn to fetch WAL for - * logical replication. - */ - wp->api.after_election(wp); } /* @@ -1112,6 +1062,9 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; + /* Now that we are ready to send it's a good moment to create WAL reader */ + wp->api.wal_reader_allocate(sk); + /* * Determine start LSN by comparing safekeeper's log term switch history * and proposer's, searching for the divergence point. @@ -1231,6 +1184,7 @@ StartStreaming(Safekeeper *sk) * once for a connection. */ sk->state = SS_ACTIVE; + sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ @@ -1289,9 +1243,13 @@ HandleActiveState(Safekeeper *sk, uint32 events) { WalProposer *wp = sk->wp; - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) + /* + * Note: we don't known which socket awoke us (sk or nwr). However, as + * SendAppendRequests always tries to send at least one msg in + * SS_ACTIVE_SEND be careful not to go there if are only after sk + * response, otherwise it'd create busy loop of pings. + */ + if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL) if (!SendAppendRequests(sk)) return; @@ -1299,28 +1257,29 @@ HandleActiveState(Safekeeper *sk, uint32 events) if (!RecvAppendResponses(sk)) return; - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. This check - * isn't necessary now, because we always send append messages immediately - * after arrival. But it's good to have it here in case we change this - * behavior in the future. - */ - if (sk->streamingAt != wp->availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ + if (events & WL_SOCKET_CLOSED) + { + walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +#endif - wp->api.update_event_set(sk, newEvents); + /* configures event set for yield whatever is the substate */ + wp->api.active_state_update_event_set(sk); } /* * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. + * socket or neon_walreader blocks, whichever comes first; active_state is + * updated accordingly. Caller should take care of updating event set. Even if + * no unsent WAL is available, at least one empty message will be sent as a + * heartbeat, if socket is ready. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connections if any error on them is encountered. * Returns false in this case, true otherwise. */ static bool @@ -1328,11 +1287,11 @@ SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; - AppendRequestHeader *req; PGAsyncWriteResult writeResult; bool sentAnything = false; + AppendRequestHeader *req; - if (sk->flushWrite) + if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) @@ -1343,76 +1302,101 @@ SendAppendRequests(Safekeeper *sk) return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; + sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > wp->availableLsn) + if (sk->active_state == SS_ACTIVE_SEND) { - endLsn = wp->availableLsn; + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > wp->availableLsn) + { + endLsn = wp->availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); + + walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + sk->active_state = SS_ACTIVE_READ_WAL; } - req = &sk->appendRequest; - PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - - walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - 
req->beginLsn); - /* wal_read will raise error on failure */ - wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn); - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) + if (sk->active_state == SS_ACTIVE_READ_WAL) { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; + char *errmsg; - case PG_ASYNC_WRITE_TRY_FLUSH: + req = &sk->appendRequest; - /* - * * We still need to call PQflush some more to finish the - * job. Caller function will handle this by setting right - * event* set. - */ - sk->flushWrite = true; - return true; + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } - case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = req->endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + sk->active_state = SS_ACTIVE_SEND; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event set. + */ + sk->active_state = SS_ACTIVE_FLUSH; + return true; + + case PG_ASYNC_WRITE_FAIL: + walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } } } @@ -1422,7 +1406,7 @@ SendAppendRequests(Safekeeper *sk) /* * Receive and process all available feedback. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. @@ -1608,39 +1592,77 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) return responses[wp->n_safekeepers - wp->quorum]; } +/* + * Return safekeeper with active connection from which WAL can be downloaded, or + * none if it doesn't exist. donor_lsn is set to end position of the donor to + * the best of our knowledge. + */ +Safekeeper * +GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +{ + *donor_lsn = InvalidXLogRecPtr; + Safekeeper *donor = NULL; + int i; + + if (wp->n_votes < wp->quorum) + { + walprop_log(WARNING, "GetDonor called before elections are won"); + return NULL; + } + + /* + * First, consider node which had determined our term start LSN as we know + * about its position immediately after election before any feedbacks are + * sent. 
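+ * Feedback from any SS_ACTIVE safekeeper can then override this choice when its + * reported flushLsn is higher, so the donor may change as append responses + * arrive.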
+ */ + if (wp->safekeeper[wp->donor].state >= SS_IDLE) + { + donor = &wp->safekeeper[wp->donor]; + *donor_lsn = wp->propEpochStartLsn; + } + + /* + * But also check feedbacks from all nodes with live connections and take + * the highest one. Note: if a node sends feedback, it has already processed + * the elected message, so its term is fine. + */ + for (i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + { + donor = sk; + *donor_lsn = sk->appendResponse.flushLsn; + } + } + return donor; +} + static void HandleSafekeeperResponse(WalProposer *wp) { XLogRecPtr minQuorumLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr candidateTruncateLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. + * Try to advance truncateLsn -- the last record flushed to all + * safekeepers. * - * Advanced truncateLsn should be not further than nearest commitLsn. This - * prevents surprising violation of truncateLsn <= commitLsn invariant - * which might occur because 1) truncateLsn can be advanced immediately - * once chunk is broadcast to all safekeepers, and commitLsn generally - * can't be advanced based on feedback from safekeeper who is still in the - * previous epoch (similar to 'leader can't commit entries from previous - * term' in Raft); 2) chunks we read from WAL and send are plain sheets of - * bytes, but safekeepers ack only on record boundaries. + * Advanced truncateLsn should not be higher than commitLsn. This prevents + * surprising violation of the truncateLsn <= commitLsn invariant which might + * occur because commitLsn generally can't be advanced based on feedback + * from a safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft). */ - minFlushLsn = CalculateMinFlushLsn(wp); - if (minFlushLsn > wp->truncateLsn) + candidateTruncateLsn = CalculateMinFlushLsn(wp); + candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + if (candidateTruncateLsn > wp->truncateLsn) { - wp->truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note that - * slot doesn't exist if we are in syncSafekeepers mode.
- */ - wp->api.confirm_wal_streamed(wp, wp->truncateLsn); + wp->truncateLsn = candidateTruncateLsn; } /* @@ -1713,7 +1735,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) case PG_ASYNC_READ_FAIL: walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), + sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1753,7 +1775,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) if (tag != anymsg->tag) { walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1824,12 +1846,13 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; - uint32 events; + uint32 sk_events; + uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1841,9 +1864,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - wp->api.update_event_set(sk, events); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * nwr_events is relevant only during SS_ACTIVE which doesn't use + * BlockingWrite + */ + Assert(!nwr_events); + if (sk_events) + wp->api.update_event_set(sk, sk_events); return true; } @@ -1876,7 +1905,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta return false; case PG_ASYNC_WRITE_FAIL: walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1915,7 +1944,7 @@ AsyncFlush(Safekeeper *sk) return false; case -1: walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ResetConnection(sk); return false; @@ -1945,18 +1974,18 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * -FormatSafekeeperState(SafekeeperState state) +FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; - switch (state) + switch (sk->state) { case SS_OFFLINE: return_val = "offline"; @@ -1984,7 +2013,18 @@ FormatSafekeeperState(SafekeeperState state) return_val = "idle"; break; case SS_ACTIVE: - return_val = "active"; + switch (sk->active_state) + { + case SS_ACTIVE_SEND: + return_val = "active send"; + break; + case 
SS_ACTIVE_READ_WAL: + return_val = "active read WAL"; + break; + case SS_ACTIVE_FLUSH: + return_val = "active flush"; + break; + } break; } @@ -1997,22 +2037,21 @@ FormatSafekeeperState(SafekeeperState state) static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { - WalProposer *wp = sk->wp; - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. `events & expected != 0`) - */ + uint32 sk_events; + uint32 nwr_events; + uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ + WalProposer *wp = sk->wp; - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * Without one more level of notify target indirection we have no way to + * distinguish which socket woke up us, so just union expected events. + */ + expected = sk_events | nwr_events; + events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { @@ -2021,36 +2060,39 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * and then an assertion that's guaranteed to fail. */ walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } -/* Returns the set of events a safekeeper in this state should be waiting on +/* Returns the set of events for both safekeeper (sk_events) and neon_walreader + * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. */ -static uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) +void +SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { - uint32 result = WL_NO_EVENTS; + WalProposer *wp = sk->wp; + + *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ - switch (state) + switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; + *sk_events = WL_SOCKET_WRITEABLE; + return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; /* * Idle states use read-readiness as a sign that the connection @@ -2058,32 +2100,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state) */ case SS_VOTING: case SS_IDLE: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. 
- */ case SS_SEND_ELECTED_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + return; + case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; + switch (sk->active_state) + { + /* + * Everything is sent; we just wait for sk responses and + * latch. + * + * Note: this assumes we send all available WAL to + * safekeeper in one wakeup (unless it blocks). Otherwise + * we would want WL_SOCKET_WRITEABLE here to finish the + * work. + */ + case SS_ACTIVE_SEND: + *sk_events = WL_SOCKET_READABLE; + /* c.f. walprop_pg_active_state_update_event_set */ +#if PG_VERSION_NUM >= 150000 + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + + /* + * Waiting for neon_walreader socket, but we still read + * responses from sk socket. + */ + case SS_ACTIVE_READ_WAL: + *sk_events = WL_SOCKET_READABLE; + *nwr_events = wp->api.wal_reader_events(sk); + return; + + /* + * Need to flush the sk socket, so ignore neon_walreader + * one and set write interest on sk. + */ + case SS_ACTIVE_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* c.f. walprop_pg_active_state_update_event_set */ + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + } + return; /* The offline state expects no events. */ case SS_OFFLINE: - result = WL_NO_EVENTS; - break; + *sk_events = 0; + return; default: Assert(false); - break; } - - return result; } /* Returns a human-readable string corresponding to the event set diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 615018c58e..6d478076fe 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -1,14 +1,15 @@ #ifndef __NEON_WALPROPOSER_H__ #define __NEON_WALPROPOSER_H__ -#include "postgres.h" -#include "access/xlogdefs.h" -#include "port.h" -#include "access/xlog_internal.h" #include "access/transam.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" #include "nodes/replnodes.h" -#include "utils/uuid.h" #include "replication/walreceiver.h" +#include "utils/uuid.h" + +#include "libpqwalproposer.h" +#include "neon_walreader.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -22,43 +23,9 @@ */ #define WL_NO_EVENTS 0 -struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ +struct WalProposerConn; /* Defined in libpqwalproposer.h */ typedef struct WalProposerConn WalProposerConn; -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - - /* - * The read is ongoing. Wait until the connection is read-ready, then try - * again. - */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - - /* - * The write started, but you'll need to call PQflush some more times to - * finish it off. We just tried, so it's best to wait until the connection - * is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - /* * WAL safekeeper state, which is used to wait for some event. 
* @@ -135,6 +102,40 @@ typedef enum SS_ACTIVE, } SafekeeperState; +/* + * Sending WAL substates of SS_ACTIVE. + */ +typedef enum +{ + /* + * We are ready to send more WAL, waiting for latch set to learn about + * more WAL becoming available (or just a timeout to send heartbeat). + */ + SS_ACTIVE_SEND, + + /* + * Polling neon_walreader to receive chunk of WAL (probably remotely) to + * send to this safekeeper. + * + * Note: socket management is done completely inside walproposer_pg for + * simplicity, and thus simulation doesn't test it. Which is fine as + * simulation is mainly aimed at consensus checks, not waiteventset + * management. + * + * Also, while in this state we don't touch safekeeper socket, so in + * theory it might close connection as inactive. This can be addressed if + * needed; however, while fetching WAL we should regularly send it, so the + * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle + * walreader socket), but similarly shouldn't be a problem. + */ + SS_ACTIVE_READ_WAL, + + /* + * Waiting for write readiness to flush the socket. + */ + SS_ACTIVE_FLUSH, +} SafekeeperActiveState; + /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -343,12 +344,11 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush,* - * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ + SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ @@ -369,12 +369,27 @@ typedef struct Safekeeper /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState *xlogreader; + NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; + + /* + * Neon WAL reader position in wait event set, or -1 if no socket. Note + * that event must be removed not only on error/failure, but also on + * successful *local* read, as next read might again be remote, but with + * different socket. + */ + int nwrEventPos; + + /* + * Per libpq docs, during connection establishment socket might change, + * remember here if it is stable to avoid readding to the event set if + * possible. Must be reset whenever nwr event is deleted. + */ + bool nwrConnEstablished; #endif @@ -403,31 +418,6 @@ typedef enum */ } WalProposerConnectPollStatusType; -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - - /* - * Any success result other than a single CopyBoth was received. The - * specifics of the result were already logged, but it may be useful to - * provide an error message indicating which safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. - */ - WP_EXEC_UNEXPECTED_SUCCESS, - - /* - * No result available at this time. Wait until read-ready, then call - * again. Internally, this is returned when PQisBusy indicates that - * PQgetResult would block. - */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - /* Re-exported ConnStatusType */ typedef enum { @@ -488,7 +478,7 @@ typedef struct walproposer_api /* Flush buffer to the network, aka PQflush. 
*/ int (*conn_flush) (Safekeeper *sk); - /* Close the connection, aka PQfinish. */ + /* Reset sk state: close pq connection, deallocate xlogreader. */ void (*conn_finish) (Safekeeper *sk); /* @@ -505,17 +495,20 @@ typedef struct walproposer_api /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); - /* Download WAL from startpos to endpos and make it available locally. */ - bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); - - /* Read WAL from disk to buf. */ - void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count); + /* + * Download WAL before basebackup for logical walsenders from sk, if + * needed + */ + bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. */ void (*wal_reader_allocate) (Safekeeper *sk); - /* Deallocate event set. */ - void (*free_event_set) (WalProposer *wp); + /* Read WAL from disk to buf. */ + NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); + + /* Returns events to be awaited on WAL reader, if any. */ + uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); @@ -523,9 +516,15 @@ typedef struct walproposer_api /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); + /* Configure wait event set for yield in SS_ACTIVE. */ + void (*active_state_update_event_set) (Safekeeper *sk); + /* Add a new safekeeper connection to the event set. */ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + /* Remove safekeeper connection from event set */ + void (*rm_safekeeper_event_set) (Safekeeper *sk); + /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available @@ -558,26 +557,12 @@ typedef struct walproposer_api */ void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); - /* - * Called on peer_horizon_lsn updates. Used to advance replication slot - * and to free up disk space by deleting unnecessary WAL. - */ - void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn); - /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); - - /* - * Called right after the proposer was elected, but before it started - * recovery and sent ProposerElected message to the safekeepers. - * - * Used by logical replication to update truncateLsn. - */ - void (*after_election) (WalProposer *wp); } walproposer_api; /* @@ -711,6 +696,13 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +/* + * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to + * recreate set from scratch, hence the export. 
+ */ +extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); +extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); + #define WPEVENT 1337 /* special log level for walproposer internal * events */ diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 04b519ab15..35d984c52e 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -3,11 +3,13 @@ * This is needed to avoid linking to full postgres server installation. This file * is compiled as a part of libwalproposer static library. */ +#include "postgres.h" #include -#include "walproposer.h" -#include "utils/datetime.h" + #include "miscadmin.h" +#include "utils/datetime.h" +#include "walproposer.h" void ExceptionalCondition(const char *conditionName, diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 551d56d416..7773aabfab 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -12,6 +12,7 @@ #include #include #include "access/xact.h" +#include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" @@ -43,14 +44,19 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" -#include "neon.h" -#include "walproposer.h" #include "libpq-fe.h" +#include "libpqwalproposer.h" +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + #define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ +#define MB ((XLogRecPtr)1024 * 1024) + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" char *wal_acceptors_list = ""; @@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp); static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); static void XLogWalPropClose(XLogRecPtr recptr); +static void add_nwr_event_set(Safekeeper *sk, uint32 events); +static void update_nwr_event_set(Safekeeper *sk, uint32 events); +static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); + +static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); + static void init_walprop_config(bool syncSafekeepers) { @@ -214,7 +226,6 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -541,14 +552,6 @@ walprop_pg_load_libpqwalreceiver(void) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); } -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from walprop_async_read */ -}; - /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk) } } -static void -walprop_connect_start(Safekeeper *sk) +WalProposerConn * +libpqwp_connect_start(char *conninfo) { + PGconn *pg_conn; + WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int n; char *password = neon_auth_token; - Assert(sk->conn == NULL); /* * Connect using the given connection string. 
If the NEON_AUTH_TOKEN @@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk) n++; } keywords[n] = "dbname"; - values[n] = sk->conninfo; + values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; @@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - sk->conn = palloc(sizeof(WalProposerConn)); - sk->conn->pg_conn = pg_conn; - sk->conn->is_nonblocking = false; /* connections always start in - * blocking mode */ - sk->conn->recvbuf = NULL; + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} + +static void +walprop_connect_start(Safekeeper *sk) +{ + Assert(sk->conn == NULL); + sk->conn = libpqwp_connect_start(sk->conninfo); + } static WalProposerConnectPollStatusType @@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk) return return_val; } -static bool -walprop_send_query(Safekeeper *sk, char *query) +extern bool +libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ - if (!ensure_nonblocking_status(sk->conn, false)) + if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(sk->conn->pg_conn, query)) + if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } -static WalProposerExecStatusType -walprop_get_query_result(Safekeeper *sk) +static bool +walprop_send_query(Safekeeper *sk, char *query) { + return libpqwp_send_query(sk->conn, query); +} + +WalProposerExecStatusType +libpqwp_get_query_result(WalProposerConn *conn) +{ + PGresult *result; WalProposerExecStatusType return_val; @@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk) char *unexpected_success = NULL; /* Consume any input that we might be missing */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; - if (PQisBusy(sk->conn->pg_conn)) + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; - result = PQgetResult(sk->conn->pg_conn); + result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & @@ -778,6 +798,12 @@ walprop_get_query_result(Safekeeper *sk) return return_val; } +static WalProposerExecStatusType +walprop_get_query_result(Safekeeper *sk) +{ + return libpqwp_get_query_result(sk->conn); +} + static pgsocket walprop_socket(Safekeeper *sk) { @@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk) return (PQflush(sk->conn->pg_conn)); } -static void -walprop_finish(Safekeeper *sk) +/* Like libpqrcv_receive. *buf is valid until the next call. */ +PGAsyncReadResult +libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - if (!sk->conn) - return; + int rawlen; - if (sk->conn->recvbuf != NULL) - PQfreemem(sk->conn->recvbuf); - PQfinish(sk->conn->pg_conn); - pfree(sk->conn); - sk->conn = NULL; -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. 
- */ -static PGAsyncReadResult -walprop_async_read(Safekeeper *sk, char **buf, int *amount) -{ - int result; - - if (sk->conn->recvbuf != NULL) + if (conn->recvbuf != NULL) { - PQfreemem(sk->conn->recvbuf); - sk->conn->recvbuf = NULL; + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; } - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(sk->conn->pg_conn)) + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); + if (rawlen == 0) { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; + /* Try consuming some data. */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* @@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true)) + switch (rawlen) { case 0: *amount = 0; @@ -854,7 +869,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ - ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn)); + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); @@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = sk->conn->recvbuf; + *amount = rawlen; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +walprop_async_read(Safekeeper *sk, char **buf, int *amount) +{ + return libpqwp_async_read(sk->conn, buf, amount); +} + static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { @@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) return true; } +void +libpqwp_disconnect(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +static void +walprop_finish(Safekeeper *sk) +{ + if (sk->conn) + { + libpqwp_disconnect(sk->conn); + sk->conn = NULL; + } + + /* free xlogreader */ + if (sk->xlogreader) + { + NeonWALReaderFree(sk->xlogreader); + sk->xlogreader = NULL; + } + rm_safekeeper_event_set(sk, false); +} + /* * Subscribe for new WAL and stream it in the loop to safekeepers. 
* @@ -1165,16 +1219,38 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* - * Receive WAL from most advanced safekeeper - */ +/* Download WAL before basebackup for logical walsenders from sk, if needed */ static bool -WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; char conninfo[MAXCONNINFO]; + TimeLineID timeline; + XLogRecPtr startpos; + XLogRecPtr endpos; + uint64 download_range_mb; + + startpos = GetLogRepRestartLSN(wp); + if (startpos == InvalidXLogRecPtr) + return true; /* recovery not needed */ + endpos = wp->propEpochStartLsn; + + /* + * If we need to download more than a max_slot_wal_keep_size, cap to it to + * avoid risk of exploding pg_wal. Logical replication won't work until + * recreated, but at least compute would start; this also follows + * max_slot_wal_keep_size semantics. + */ + download_range_mb = (endpos - startpos) / 1024 / 1024; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; + walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); + } + timeline = wp->greetRequest.timeline; if (!neon_auth_token) { @@ -1204,7 +1280,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL return false; } elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " "%d", sk->host, sk->port, (uint32) (startpos >> 32), (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); @@ -1400,30 +1476,56 @@ XLogWalPropClose(XLogRecPtr recptr) walpropFile = -1; } -static void -walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count) -{ - WALReadError errinfo; - - if (!WALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id(), - &errinfo)) - { - WALReadRaiseError(&errinfo); - } -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { - sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + char log_prefix[64]; + + snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + Assert(!sk->xlogreader); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); } +static NeonWALReadResult +walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) +{ + NeonWALReadResult res; + + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * If we have the socket subscribed, but walreader doesn't need any + * events, it must mean that remote connection just closed hoping to + * do next read locally. Remove the socket then. It is important to do + * as otherwise next read might open another connection and we won't + * be able to distinguish whether we have correct socket added in wait + * event set. 
+ */ + if (NeonWALReaderEvents(sk->xlogreader) == 0) + rm_safekeeper_event_set(sk, false); + } + else if (res == NEON_WALREAD_ERROR) + { + *errmsg = NeonWALReaderErrMsg(sk->xlogreader); + } + + return res; +} + +static uint32 +walprop_pg_wal_reader_events(Safekeeper *sk) +{ + return NeonWALReaderEvents(sk->xlogreader); +} + static WaitEventSet *waitEvents; static void @@ -1438,6 +1540,8 @@ walprop_pg_free_event_set(WalProposer *wp) for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1447,11 +1551,37 @@ walprop_pg_init_event_set(WalProposer *wp) if (waitEvents) elog(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers); + /* for each sk, we have socket plus potentially socket for neon walreader */ + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); + + for (int i = 0; i < wp->n_safekeepers; i++) + { + wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; + } +} + +/* add safekeeper socket to wait event set */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->eventPos == -1); + sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); +} + +/* add neon wal reader socket to wait event set */ +static void +add_nwr_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->nwrEventPos == -1); + sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); + elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1463,10 +1593,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } +/* + * Update neon_walreader event. + * Can be called when nwr socket doesn't exist, does nothing in this case. + */ static void -walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +update_nwr_event_set(Safekeeper *sk, uint32 events) { - sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); + /* eventPos = -1 when we don't have an event */ + if (sk->nwrEventPos != -1) + ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); +} + + +static void +walprop_pg_active_state_update_event_set(Safekeeper *sk) +{ + uint32 sk_events; + uint32 nwr_events; + + Assert(sk->state == SS_ACTIVE); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * If we need to wait for neon_walreader, ensure we have up to date socket + * in the wait event set. + */ + if (sk->active_state == SS_ACTIVE_READ_WAL) + { + /* + * If conn is established and socket is thus stable, update the event + * directly; otherwise re-add it. + */ + if (sk->nwrConnEstablished) + { + Assert(sk->nwrEventPos != -1); + update_nwr_event_set(sk, nwr_events); + } + else + { + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } + } + else + { + /* + * Hack: we should always set 0 here, but for random reasons + * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least + * some event. 
Since there is also no way to remove socket except + * reconstructing the whole set, SafekeeperStateDesiredEvents instead + * gives WL_SOCKET_CLOSED if socket exists. We never expect it to + * trigger. + * + * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event + * removal. + */ +#if PG_VERSION_NUM >= 150000 + Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); + update_nwr_event_set(sk, WL_SOCKET_CLOSED); +#else /* pg 14 */ + rm_safekeeper_event_set(sk, false); +#endif + } + walprop_pg_update_event_set(sk, sk_events); +} + +static void +walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) +{ + rm_safekeeper_event_set(to_remove, true); +} + +/* + * A hacky way to remove single event from the event set. Can be called if event + * doesn't exist, does nothing in this case. + * + * Note: Internally, this completely reconstructs the event set. It should be + * avoided if possible. + * + * If is_sk is true, socket of connection to safekeeper is removed; otherwise + * socket of neon_walreader. + */ +static void +rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) +{ + WalProposer *wp = to_remove->wp; + + elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); + + /* + * Shortpath for exiting if have nothing to do. We never call this + * function with safekeeper socket not existing, but do that with neon + * walreader socket. + */ + if ((is_sk && to_remove->eventPos == -1) || + (!is_sk && to_remove->nwrEventPos == -1)) + { + return; + } + + /* Remove the existing event set, assign sk->eventPos = -1 */ + walprop_pg_free_event_set(wp); + + /* Re-initialize it without adding any safekeeper events */ + wp->api.init_event_set(wp); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + /* + * If this safekeeper isn't offline, add events for it, except for the + * event requested to remove. + */ + if (sk->state != SS_OFFLINE) + { + uint32 sk_events; + uint32 nwr_events; + + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + if (sk != to_remove || !is_sk) + { + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, sk_events); + } + if ((sk != to_remove || is_sk) && nwr_events) + { + add_nwr_event_set(sk, nwr_events); + } + } + } } static int @@ -1482,6 +1746,21 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 #if PG_MAJORVERSION_NUM >= 16 if (WalSndCtl != NULL) ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); + + /* + * Now that we prepared the condvar, check flush ptr again -- it might + * have changed before we subscribed to cv so we missed the wakeup. + * + * Do that only when we're interested in new WAL: without sync-safekeepers + * and if election already passed. + */ + if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn) + { + ConditionVariableCancelSleep(); + ResetLatch(MyLatch); + *events = WL_LATCH_SET; + return 1; + } #endif /* @@ -1533,7 +1812,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Get PageserverFeedback fields from the most advanced safekeeper + * Choose most advanced PageserverFeedback and set it to *rf. 
*/ static void GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) @@ -1563,8 +1842,6 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) LSN_FORMAT_ARGS(rf->disk_consistent_lsn), LSN_FORMAT_ARGS(rf->remote_consistent_lsn), rf->replytime); - - replication_feedback_set(rf); } /* @@ -1604,63 +1881,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) hs->catalog_xmin = InvalidFullTransactionId; } +/* + * Based on commitLsn and safekeeper responses including pageserver feedback, + * 1) Propagate cluster size received from ps to ensure the limit. + * 2) Propagate pageserver LSN positions to ensure backpressure limits. + * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters). + * 4) Propagate hot standby feedback. + * + * None of that is functional in sync-safekeepers. + */ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) { HotStandbyFeedback hsFeedback; - XLogRecPtr diskConsistentLsn; + XLogRecPtr oldDiskConsistentLsn; - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + if (wp->config->syncSafekeepers) + return; - if (!wp->config->syncSafekeepers) + oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + replication_feedback_set(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + + if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - if (commitLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = commitLsn; - /* advance the replication slot */ - if (!wp->config->syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* + * Advance the replication slot to commitLsn. WAL before it is + * hardened and will be fetched from one of safekeepers by + * neon_walreader if needed. + * + * Also wakes up syncrep waiters. + */ + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. - */ - quorumFeedback.rf.disk_consistent_lsn, - walprop_pg_get_current_timestamp(wp), false); + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. 
+ */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { quorumFeedback.hs = hsFeedback; - if (!wp->config->syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } } -static void -walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn) -{ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(lsn); -} - static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { @@ -1679,15 +1962,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static void -walprop_pg_after_election(WalProposer *wp) +static XLogRecPtr +GetLogRepRestartLSN(WalProposer *wp) { FILE *f; - XLogRecPtr lrRestartLsn; + XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; /* We don't need to do anything in syncSafekeepers mode. */ if (wp->config->syncSafekeepers) - return; + return InvalidXLogRecPtr; /* * If there are active logical replication subscription we need to provide @@ -1695,22 +1978,40 @@ walprop_pg_after_election(WalProposer *wp) * replication slots. */ f = fopen("restart.lsn", "rb"); - if (f != NULL && !wp->config->syncSafekeepers) + if (f != NULL) { - fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + fclose(f); - if (lrRestartLsn != InvalidXLogRecPtr) + if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { - elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + uint64 download_range_mb; + + elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + + /* + * If we need to download more than a max_slot_wal_keep_size, + * don't do it to avoid risk of exploding pg_wal. Logical + * replication won't work until recreated, but at least compute + * would start; this also follows max_slot_wal_keep_size + * semantics. 
+ */ + download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + return InvalidXLogRecPtr; + } /* * start from the beginning of the segment to fetch page headers * verifed by XLogReader */ lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); } } + return lrRestartLsn; } static const walproposer_api walprop_pg = { @@ -1730,18 +2031,18 @@ static const walproposer_api walprop_pg = { .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, - .wal_read = walprop_pg_wal_read, .wal_reader_allocate = walprop_pg_wal_reader_allocate, - .free_event_set = walprop_pg_free_event_set, + .wal_read = walprop_pg_wal_read, + .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, + .active_state_update_event_set = walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, - .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, - .after_election = walprop_pg_after_election, }; diff --git a/poetry.lock b/poetry.lock index 8583a71f85..76dfd6d37d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2092,51 +2092,61 @@ files = [ [[package]] name = "pyyaml" -version = "6.0" +version = "6.0.1" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.6" files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = 
"PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - 
{file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = 
"PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] [[package]] @@ -2553,85 +2563,101 @@ files = [ [[package]] name = "yarl" -version = "1.8.2" +version = "1.9.4" description = "Yet another URL library" optional = false python-versions = ">=3.7" files = [ - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = 
"sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, - {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, - {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, - {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, - {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, - {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, - {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, - {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, - {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, - {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash 
= "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, - {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, - {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, - {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = 
"sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = 
"yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, ] [package.dependencies] diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index eadb9abd43..64ef108e11 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -87,6 +87,10 @@ impl AuthError { pub fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } + + pub fn is_auth_failed(&self) -> bool { + matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + } } impl> From for AuthError { diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3b09e05bd2..923bd02560 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,7 +9,6 @@ use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::AuthInfo; use crate::console::AuthSecret; use crate::proxy::connect_compute::handle_try_wake; use crate::proxy::retry::retry_after; @@ -187,24 +186,52 @@ async fn auth_quirks( }; info!("fetching user's authentication info"); - // TODO(anna): this will slow down both "hacks" below; we probably need a cache. - let AuthInfo { - secret, - allowed_ips, - } = api.get_auth_info(extra, &info).await?; + let allowed_ips = api.get_allowed_ips(extra, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let secret = secret.unwrap_or_else(|| { + let cached_secret = api.get_role_secret(extra, &info).await?; + + let secret = cached_secret.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. 
info!("authentication info not found, mocking it"); AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) }); + match authenticate_with_secret( + secret, + info, + client, + unauthenticated_password, + allow_cleartext, + config, + latency_timer, + ) + .await + { + Ok(keys) => Ok(keys), + Err(e) => { + if e.is_auth_failed() { + // The password could have been changed, so we invalidate the cache. + cached_secret.invalidate(); + } + Err(e) + } + } +} +async fn authenticate_with_secret( + secret: AuthSecret, + info: ComputeUserInfo, + client: &mut stream::PqStream>, + unauthenticated_password: Option>, + allow_cleartext: bool, + config: &'static AuthenticationConfig, + latency_timer: &mut LatencyTimer, +) -> auth::Result> { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ae4c42bcb1..5bc2d377a6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -6,10 +6,12 @@ use proxy::config::HttpConfig; use proxy::console; use proxy::console::provider::AllowedIpsCache; use proxy::console::provider::NodeInfoCache; +use proxy::console::provider::RoleSecretCache; use proxy::http; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; @@ -86,7 +88,7 @@ struct ProxyCliArgs { #[clap(long)] metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] @@ -94,12 +96,8 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, - /// timeout for http connections - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -127,13 +125,46 @@ struct ProxyCliArgs { #[clap(flatten)] aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, } +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will cause sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. + /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let _logging_guard = proxy::logging::init().await?; @@ -266,9 +297,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; + let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); + info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { node_info: NodeInfoCache::new( "node_info_cache", @@ -282,6 +315,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allowed_ips_cache_config.ttl, false, ), + role_secret: RoleSecretCache::new( + "role_secret_cache", + role_secret_cache_config.size, + role_secret_cache_config.ttl, + false, + ), })); let config::WakeComputeLockOptions { @@ -315,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; let http_config = HttpConfig { - timeout: args.sql_over_http_timeout, - pool_opt_in: args.sql_over_http_pool_opt_in, + request_timeout: args.sql_over_http.sql_over_http_timeout, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + }, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, diff --git a/proxy/src/config.rs
b/proxy/src/config.rs index f932df4058..610bf7e424 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::{auth, rate_limiter::RateBucketInfo}; +use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -36,8 +36,8 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub timeout: tokio::time::Duration, - pub pool_opt_in: bool, + pub request_timeout: tokio::time::Duration, + pub pool_options: GlobalConnPoolOptions, } pub struct AuthenticationConfig { @@ -310,10 +310,10 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::console::provider::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. - /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. fn parse(options: &str) -> anyhow::Result { let mut size = None; let mut ttl = None; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8d399f26ea..e4cf1e8c8e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -10,6 +10,7 @@ use crate::{ }; use async_trait::async_trait; use dashmap::DashMap; +use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; use tokio::{ sync::{OwnedSemaphorePermit, Semaphore}, @@ -216,6 +217,7 @@ impl ConsoleReqExtra { } /// Auth secret which is managed by the cloud. +#[derive(Clone)] pub enum AuthSecret { #[cfg(feature = "testing")] /// Md5 hash of user's password. @@ -250,18 +252,20 @@ pub struct NodeInfo { pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru, Arc>>; +pub type AllowedIpsCache = TimedLru>>; +pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; +pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. #[async_trait] pub trait Api { /// Get the client's auth secret for authentication. - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result; + ) -> Result; async fn get_allowed_ips( &self, @@ -282,7 +286,9 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: TimedLru, Arc>>, + pub allowed_ips: AllowedIpsCache, + /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. + pub role_secret: RoleSecretCache, } /// Various caches for [`console`](super). 
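Note on the proxy/src/console/provider.rs hunk above: the single `get_auth_info` call is split into `get_role_secret` and `get_allowed_ips`, with role secrets cached in a `TimedLru` keyed by `(endpoint, role)` and invalidated when authentication fails (see `cached_secret.invalidate()` in auth/backend.rs). A minimal standalone sketch of that caching pattern, using a plain `HashMap` with a TTL rather than the actual `TimedLru` type; the endpoint and role names below are purely hypothetical:

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

// Simplified stand-in for the proxy's AuthSecret (the real type wraps a SCRAM server secret).
#[derive(Clone)]
struct Secret(String);

// Hypothetical TTL cache keyed by (endpoint, role), loosely mirroring
// `RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option<AuthSecret>>` from the hunk above.
struct RoleSecretCache {
    ttl: Duration,
    entries: HashMap<(String, String), (Instant, Option<Secret>)>,
}

impl RoleSecretCache {
    fn new(ttl: Duration) -> Self {
        Self { ttl, entries: HashMap::new() }
    }

    // A hit returns the cached value; the inner None models a cached "role has no secret".
    fn get(&self, key: &(String, String)) -> Option<Option<Secret>> {
        match self.entries.get(key) {
            Some((inserted, secret)) if inserted.elapsed() < self.ttl => Some(secret.clone()),
            _ => None,
        }
    }

    fn insert(&mut self, key: (String, String), secret: Option<Secret>) {
        self.entries.insert(key, (Instant::now(), secret));
    }

    // Counterpart of `cached_secret.invalidate()` in auth_quirks: drop the entry
    // when authentication fails, e.g. because the password was changed.
    fn invalidate(&mut self, key: &(String, String)) {
        self.entries.remove(key);
    }
}

fn main() {
    // ttl mirrors the "size=4000,ttl=4m" default cache options (size bounding omitted here).
    let mut cache = RoleSecretCache::new(Duration::from_secs(4 * 60));
    // Hypothetical endpoint and role, for illustration only.
    let key = ("ep-example-123456".to_string(), "app_user".to_string());
    cache.insert(key.clone(), Some(Secret("SCRAM-SHA-256 server secret".into())));
    assert!(cache.get(&key).is_some()); // served from cache until the TTL expires
    cache.invalidate(&key); // e.g. after an AuthFailed error
    assert!(cache.get(&key).is_none()); // next lookup falls through to the console API
}
```

The real cache additionally bounds its size and hands out a `Cached` wrapper whose `invalidate()` drops the entry, which is what lets a changed password fall through to the console on the next attempt.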
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index c464b4daf2..dba5e5863f 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -6,6 +6,7 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; +use crate::console::provider::CachedRoleSecret; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; @@ -142,12 +143,14 @@ async fn get_execute_postgres_query( #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(creds).await + ) -> Result { + Ok(CachedRoleSecret::new_uncached( + self.do_get_auth_info(creds).await?.secret, + )) } async fn get_allowed_ips( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f748c9a41f..5bf7b0f986 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,14 +3,15 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + NodeInfo, }; use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use std::{net::SocketAddr, sync::Arc}; +use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -140,7 +141,7 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. 
let node = NodeInfo { config, @@ -159,12 +160,25 @@ impl Api { #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(extra, creds).await + ) -> Result { + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); + if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(extra, creds).await?; + let (_, secret) = self + .caches + .role_secret + .insert((ep.clone(), user), auth_info.secret.clone()); + self.caches + .allowed_ips + .insert(ep, Arc::new(auth_info.allowed_ips)); + Ok(secret) } async fn get_allowed_ips( @@ -172,8 +186,7 @@ impl super::Api for Api { extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { - let key: &str = &creds.endpoint; - if let Some(allowed_ips) = self.caches.allowed_ips.get(key) { + if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -182,10 +195,14 @@ impl super::Api for Api { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) .inc(); - let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips); + let auth_info = self.do_get_auth_info(extra, creds).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); self.caches - .allowed_ips - .insert(key.into(), allowed_ips.clone()); + .role_secret + .insert((ep.clone(), user), auth_info.secret); + self.caches.allowed_ips.insert(ep, allowed_ips.clone()); Ok(allowed_ips) } @@ -252,9 +269,10 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(String, u16)> { - let parsed: SocketAddr = input.parse().ok()?; - Some((parsed.ip().to_string(), parsed.port())) +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) } #[cfg(test)] @@ -262,9 +280,24 @@ mod tests { use super::*; #[test] - fn test_parse_host_port() { + fn test_parse_host_port_v4() { let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); assert_eq!(host, "127.0.0.1"); assert_eq!(port, 5432); } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index e9c65fcef3..bd93fb2b70 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 424beccec9..9e74e07af1 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,6 +5,7 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. +#[derive(Clone)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index e358a0712f..07825da8dc 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,9 +6,13 @@ mod conn_pool; mod sql_over_http; mod websocket; +pub use conn_pool::GlobalConnPoolOptions; + use anyhow::bail; use hyper::StatusCode; use metrics::IntCounterPairGuard; +use rand::rngs::StdRng; +use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; @@ -47,6 +51,11 @@ pub async fn task_main( let conn_pool = conn_pool::GlobalConnPool::new(config); + let conn_pool2 = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool2.gc_worker(StdRng::from_entropy()).await; + }); + // shutdown the connection pool tokio::spawn({ let cancellation_token = cancellation_token.clone(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index ab8903418b..c476560215 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,15 +1,19 @@ use anyhow::{anyhow, Context}; use async_trait::async_trait; use dashmap::DashMap; -use futures::future::poll_fn; +use futures::{future::poll_fn, Future}; +use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; +use once_cell::sync::Lazy; use parking_lot::RwLock; use pbkdf2::{ password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, Params, Pbkdf2, }; use pq_proto::StartupMessageParams; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use rand::Rng; use smol_str::SmolStr; -use std::{collections::HashMap, net::IpAddr, sync::Arc}; +use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, task::{ready, Poll}, @@ -18,7 +22,7 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ @@ -30,11 +34,10 @@ use crate::{ }; use crate::{compute, config}; -use tracing::{error, warn, Span}; +use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; pub const APP_NAME: &str = "/sql_over_http"; -const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] pub struct ConnInfo { @@ -69,6 +72,77 @@ struct ConnPoolEntry { pub struct EndpointConnPool { pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, + max_conns: usize, + _guard: IntCounterPairGuard, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + let Self { + pools, total_conns, .. 
+ } = self; + pools + .get_mut(&db_user) + .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + } + + fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + let Self { + pools, total_conns, .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.conn_id != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + let conn_id = client.conn_id; + + if client.inner.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return Ok(()); + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + // we create this db-user entry in get, so it should not be None + if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + } + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } } /// 4096 is the number of rounds that SCRAM-SHA-256 recommends. @@ -87,6 +161,27 @@ pub struct DbUserConnPool { password_hash: Option, } +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.inner.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + } + + fn get_conn_entry(&mut self, conns: &mut usize) -> Option { + self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + } + conn + } +} + pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // @@ -94,52 +189,127 @@ pub struct GlobalConnPool { // pool as early as possible and release the lock. global_pool: DashMap>>, + /// Number of endpoint-connection pools + /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. global_pool_size: AtomicUsize, + proxy_config: &'static crate::config::ProxyConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { // Maximum number of connections per one endpoint. // Can mix different (dbname, username) connections. // When running out of free slots for a particular endpoint, // falls back to opening a new connection for each request. - max_conns_per_endpoint: usize, + pub max_conns_per_endpoint: usize, - proxy_config: &'static crate::config::ProxyConfig, + pub gc_epoch: Duration, - // Using a lock to remove any race conditions. 
- // Eg cleaning up connections while a new connection is returned - closed: RwLock, + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, } +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + "proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + impl GlobalConnPool { pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + let shards = config.http_config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::new(), + global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, proxy_config: config, - closed: RwLock::new(false), }) } pub fn shutdown(&self) { - *self.closed.write() = true; + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } - self.global_pool.retain(|_, endpoint_pool| { - let mut pool = endpoint_pool.write(); - // by clearing this hashmap, we remove the slots that a connection can be returned to. - // when returning, it drops the connection if the slot doesn't exist - pool.pools.clear(); - pool.total_conns = 0; + pub async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; - false + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = GC_LATENCY.start_timer(); + let current_len = shard.len(); + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + pools + .iter_mut() + .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true }); + let new_len = shard.len(); + drop(shard); + timer.observe_duration(); + + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } } pub async fn get( self: &Arc, - conn_info: &ConnInfo, + conn_info: ConnInfo, force_new: bool, session_id: uuid::Uuid, peer_addr: IpAddr, @@ -147,15 +317,11 @@ impl GlobalConnPool { let mut client: Option = None; let mut latency_timer = LatencyTimer::new("http"); - let pool = if force_new { - None - } else { - Some((conn_info.clone(), self.clone())) - }; - let mut hash_valid = false; + let mut endpoint_pool = Weak::new(); if !force_new { let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); + endpoint_pool = Arc::downgrade(&pool); let mut hash = None; // find a pool entry by (dbname, username) if exists @@ -180,12 +346,8 @@ impl GlobalConnPool { // we will continue with the regular connection flow if validate.is_ok() { hash_valid = true; - let mut pool = pool.write(); - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - if let Some(entry) = pool_entries.conns.pop() { - client = Some(entry.conn); - pool.total_conns -= 1; - } + if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { + client = Some(entry.conn) } } } @@ -198,11 +360,12 @@ impl GlobalConnPool { info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await } else { @@ -214,18 +377,19 @@ impl GlobalConnPool { ); latency_timer.pool_hit(); latency_timer.success(); - return Ok(Client::new(client, pool).await); + return Ok(Client::new(client, conn_info, endpoint_pool).await); } } else { let conn_id = uuid::Uuid::new_v4(); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await }; @@ -269,59 +433,7 @@ impl GlobalConnPool { _ => {} } let new_client = new_client?; - Ok(Client::new(new_client, pool).await) - } - - fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { - let conn_id = client.conn_id; - - // We want to hold this open while we return. This ensures that the pool can't close - // while we are in the middle of returning the connection. 
- let closed = self.closed.read(); - if *closed { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed"); - return Ok(()); - } - - if client.inner.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); - } - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < self.max_conns_per_endpoint { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - } - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - - Ok(()) + Ok(Client::new(new_client, conn_info, endpoint_pool).await) } fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { @@ -334,6 +446,12 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, + max_conns: self + .proxy_config + .http_config + .pool_options + .max_conns_per_endpoint, + _guard: ENDPOINT_POOLS.guard(), })); // find or create a pool for this endpoint @@ -363,9 +481,11 @@ impl GlobalConnPool { } struct TokioMechanism<'a> { + pool: Weak>, conn_info: &'a ConnInfo, session_id: uuid::Uuid, conn_id: uuid::Uuid, + idle: Duration, } #[async_trait] @@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> { timeout, self.conn_id, self.session_id, + self.pool.clone(), + self.idle, ) .await } @@ -403,6 +525,7 @@ async fn connect_to_compute( session_id: uuid::Uuid, latency_timer: LatencyTimer, peer_addr: IpAddr, + pool: Weak>, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -431,7 +554,6 @@ async fn connect_to_compute( application_name: APP_NAME.to_string(), options: console_options, }; - // TODO(anna): this is a bit hacky way, consider using console notification listener. 
if !config.disable_ip_check_for_http { let allowed_ips = backend.get_allowed_ips(&extra).await?; if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) { @@ -448,6 +570,8 @@ async fn connect_to_compute( conn_id, conn_info, session_id, + pool, + idle: config.http_config.pool_options.idle_timeout, }, node_info, &extra, @@ -463,6 +587,8 @@ async fn connect_to_compute_once( timeout: time::Duration, conn_id: uuid::Uuid, mut session: uuid::Uuid, + pool: Weak>, + idle: Duration, ) -> Result { let mut config = (*node_info.config).clone(); @@ -491,13 +617,29 @@ async fn connect_to_compute_once( branch_id: node_info.aux.branch_id.clone(), }; + let db_user = conn_info.db_and_user(); tokio::spawn( async move { let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); info!(%session, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); + if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. + // does nothing if the client is currently checked-out and in-use + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("idle connection removed"); + } + } } loop { @@ -515,15 +657,25 @@ async fn connect_to_compute_once( } Some(Err(e)) => { error!(%session, "connection error: {}", e); - return Poll::Ready(()) + break } None => { info!("connection closed"); - return Poll::Ready(()) + break } } } - }).await + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + } .instrument(span) ); @@ -553,23 +705,27 @@ pub struct Client { conn_id: uuid::Uuid, span: Span, inner: Option, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, } pub struct Discard<'a> { conn_id: uuid::Uuid, - pool: &'a mut Option<(ConnInfo, Arc)>, + conn_info: &'a ConnInfo, + pool: &'a mut Weak>, } impl Client { pub(self) async fn new( inner: ClientInner, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, ) -> Self { Self { conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), + conn_info, pool, } } @@ -578,6 +734,7 @@ impl Client { inner, pool, conn_id, + conn_info, span: _, } = self; ( @@ -587,6 +744,7 @@ impl Client { .inner, Discard { pool, + conn_info, conn_id: *conn_id, }, ) @@ -602,14 +760,14 @@ impl Client { impl Discard<'_> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - if status != ReadyForQueryStatus::Idle { - if let Some((conn_info, _)) = self.pool.take() { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") - } + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { - if let Some((conn_info, _)) = self.pool.take() { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } @@ -629,16 
+787,17 @@ impl Deref for Client { impl Drop for Client { fn drop(&mut self) { + let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); - if let Some((conn_info, conn_pool)) = self.pool.take() { + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool tokio::task::spawn_blocking(move || { let _span = current_span.enter(); - let _ = conn_pool.put(&conn_info, client); + let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 307b085ce0..2e9d8526d3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -206,7 +206,7 @@ pub async fn handle( config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.timeout, + config.request_timeout, handle_inner( config, request, @@ -278,7 +278,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.timeout.as_secs() + config.request_timeout.as_secs() ); error!(message); json_response( @@ -320,7 +320,8 @@ async fn handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = + !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -359,7 +360,7 @@ async fn handle_inner( let payload: Payload = serde_json::from_slice(&body)?; let mut client = conn_pool - .get(&conn_info, !allow_pool, session_id, peer_addr) + .get(conn_info, !allow_pool, session_id, peer_addr) .await?; let mut response = Response::builder() diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 2acbb2352b..7b9f96dce3 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,9 +1,12 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; +use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; @@ -40,7 +43,7 @@ impl TimelineAnalysis { pub(crate) fn branch_cleanup_and_check_errors( id: &TenantShardTimelineId, - s3_root: &RootTarget, + tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, s3_data: Option, @@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation, - mut s3_layers, + index_part_generation: _index_part_generation, + s3_layers: _s3_layers, } => { if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { result.errors.push(format!( @@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors( )) } - let layer_map_key = (layer, metadata.generation); - if !s3_layers.remove(&layer_map_key) { + if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { // FIXME: this will emit false positives if an index was // uploaded concurrently with our scan. 
To make this check // correct, we need to try sending a HEAD request for the // layer we think is missing. result.errors.push(format!( - "index_part.json contains a layer {}{} that is not present in remote storage", - layer_map_key.0.file_name(), - layer_map_key.1.get_suffix() + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", + layer.file_name(), + metadata.generation.get_suffix(), + metadata.shard )) } } - - let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers - .into_iter() - .filter(|(_layer_name, gen)| - // A layer is only considered orphaned if it has a generation below - // the index. If the generation is >= the index, then the layer may - // be an upload from a running pageserver, or even an upload from - // a new generation that didn't upload an index yet. - // - // Even so, a layer that is not referenced by the index could just - // be something enqueued for deletion, so while this check is valid - // for indicating that a layer is garbage, it is not an indicator - // of a problem. - gen < &index_part_generation) - .collect(); - - if !orphan_layers.is_empty() { - // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report - // these as a hint that there is something worth cleaning up here. - result.warnings.push(format!( - "index_part.json does not contain layers from S3: {:?}", - orphan_layers - .iter() - .map(|(layer_name, gen)| format!( - "{}{}", - layer_name.file_name(), - gen.get_suffix() - )) - .collect::>(), - )); - result.garbage_keys.extend(orphan_layers.iter().map( - |(layer_name, layer_gen)| { - let mut key = s3_root.timeline_root(id).prefix_in_bucket; - let delimiter = s3_root.delimiter(); - if !key.ends_with(delimiter) { - key.push_str(delimiter); - } - key.push_str(&format!( - "{}{}", - &layer_name.file_name(), - layer_gen.get_suffix() - )); - key - }, - )); - } } BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( @@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors( result } +#[derive(Default)] +pub(crate) struct LayerRef { + ref_count: usize, +} + +/// Top-level index of objects in a tenant. This may be used by any shard-timeline within +/// the tenant to query whether an object exists. +#[derive(Default)] +pub(crate) struct TenantObjectListing { + shard_timelines: + HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, +} + +impl TenantObjectListing { + /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall + /// list of layer keys for the Tenant. + pub(crate) fn push( + &mut self, + ttid: TenantShardTimelineId, + layers: HashSet<(LayerFileName, Generation)>, + ) { + let shard_index = ShardIndex::new( + ttid.tenant_shard_id.shard_number, + ttid.tenant_shard_id.shard_count, + ); + let replaced = self.shard_timelines.insert( + (shard_index, ttid.timeline_id), + layers + .into_iter() + .map(|l| (l, LayerRef::default())) + .collect(), + ); + + assert!( + replaced.is_none(), + "Built from an S3 object listing, which should never repeat a key" + ); + } + + /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, + /// the layer's refcount will be incremented. Later, after calling this for all references in all indices + /// in a tenant, orphan layers may be detected by their zero refcounts. 
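The refcounting scheme described here — push the S3 listing for each shard/timeline, bump a counter for every layer an index references, and report zero-count layers as orphan candidates afterwards — can be sketched stand-alone like this (simplified key types only; not the scrubber's real `ShardIndex`/`LayerFileName`/`Generation` types):

```rust
use std::collections::{HashMap, HashSet};

// Illustrative stand-ins for (ShardIndex, TimelineId) and (LayerFileName, Generation).
type ShardTimeline = (u8, u64);
type LayerKey = (String, u32);

#[derive(Default)]
struct ObjectListing {
    // shard/timeline -> layer key -> number of indices that reference it
    shard_timelines: HashMap<ShardTimeline, HashMap<LayerKey, usize>>,
}

impl ObjectListing {
    /// Record the layers found by listing one timeline prefix, all starting at refcount 0.
    fn push(&mut self, ttid: ShardTimeline, layers: HashSet<LayerKey>) {
        let entry = layers.into_iter().map(|l| (l, 0)).collect();
        assert!(self.shard_timelines.insert(ttid, entry).is_none());
    }

    /// Called for every layer referenced by an index. Returns true if the listing has it.
    fn check_ref(&mut self, ttid: ShardTimeline, layer: &LayerKey) -> bool {
        match self.shard_timelines.get_mut(&ttid).and_then(|m| m.get_mut(layer)) {
            Some(refcount) => {
                *refcount += 1;
                true
            }
            None => false,
        }
    }

    /// After all indices were applied, anything still at refcount 0 is an orphan candidate.
    fn orphans(&self) -> Vec<(ShardTimeline, LayerKey)> {
        let mut result = Vec::new();
        for (ttid, layers) in &self.shard_timelines {
            for (layer, refcount) in layers {
                if *refcount == 0 {
                    result.push((*ttid, layer.clone()));
                }
            }
        }
        result
    }
}

fn main() {
    let mut listing = ObjectListing::default();
    listing.push(
        (0, 1),
        HashSet::from([("a".to_string(), 1), ("b".to_string(), 1)]),
    );

    assert!(listing.check_ref((0, 1), &("a".to_string(), 1))); // referenced by an index
    assert!(!listing.check_ref((0, 1), &("missing".to_string(), 1))); // index points at a missing layer

    // "b" was never referenced, so it shows up as an orphan candidate.
    assert_eq!(listing.orphans(), vec![((0, 1), ("b".to_string(), 1))]);
}
```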
+ /// + /// Returns true if the layer exists + pub(crate) fn check_ref( + &mut self, + timeline_id: TimelineId, + layer_file: &LayerFileName, + metadata: &IndexLayerMetadata, + ) -> bool { + let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { + return false; + }; + + let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else { + return false; + }; + + layer_ref.ref_count += 1; + + true + } + + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + let mut result = Vec::new(); + for ((shard_index, timeline_id), layers) in &self.shard_timelines { + for ((layer_file, generation), layer_ref) in layers { + if layer_ref.ref_count == 0 { + result.push((*shard_index, *timeline_id, layer_file.clone(), *generation)) + } + } + } + + result + } +} + #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2338c21e5..8fb1346c8e 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -15,6 +15,7 @@ use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; use aws_sdk_s3::config::Region; @@ -255,6 +256,11 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let chain = CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder().build(), ); // Use SSO if we were given an account ID @@ -265,7 +271,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") - .region(Region::from_static("eu-central-1")) + .region(bucket_region.clone()) .build(), ), None => chain, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 91347ca21b..bcc4d2e618 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TimelineAnalysis, + TenantObjectListing, TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; +use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; use serde::Serialize; +use utils::id::TenantId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, with_errors: HashSet, with_warnings: HashSet, - with_garbage: HashSet, + with_orphans: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -87,7 +90,7 @@ impl MetadataSummary { count: 0, with_errors: HashSet::new(), with_warnings: HashSet::new(), - with_garbage: HashSet::new(), + with_orphans: HashSet::new(), indices_by_version: HashMap::new(), layer_count: MinMaxHisto::new(), 
timeline_size_bytes: MinMaxHisto::new(), @@ -141,6 +144,10 @@ impl MetadataSummary { } } + fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) { + self.with_orphans.insert(*ttid); + } + /// Long-form output for printing at end of a scan pub fn summary_string(&self) -> String { let version_summary: String = itertools::join( @@ -154,7 +161,7 @@ impl MetadataSummary { "Timelines: {0} With errors: {1} With warnings: {2} -With garbage: {3} +With orphan layers: {3} Index versions: {version_summary} Timeline size bytes: {4} Layer size bytes: {5} @@ -163,7 +170,7 @@ Timeline layer count: {6} self.count, self.with_errors.len(), self.with_warnings.len(), - self.with_garbage.len(), + self.with_orphans.len(), self.timeline_size_bytes.oneline(), self.layer_size_bytes.oneline(), self.layer_count.oneline(), @@ -191,7 +198,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::ResultS3TimelineBlobData for each tenant, because different + // shards in the same tenant might refer to one anothers' keys if a shard split has happened. + + let mut tenant_id = None; + let mut tenant_objects = TenantObjectListing::default(); + let mut tenant_timeline_results = Vec::new(); + + fn analyze_tenant( + tenant_id: TenantId, + summary: &mut MetadataSummary, + mut tenant_objects: TenantObjectListing, + timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + ) { + let mut timeline_generations = HashMap::new(); + for (ttid, data) in timelines { + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = + branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); + summary.update_analysis(&ttid, &analysis); + } + + // Identifying orphan layers must be done on a tenant-wide basis, because individual + // shards' layers may be referenced by other shards. + // + // Orphan layers are not a corruption, and not an indication of a problem. They are just + // consuming some space in remote storage, and may be cleaned up at leisure. + for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() { + let ttid = TenantShardTimelineId { + tenant_shard_id: TenantShardId { + tenant_id, + shard_count: shard_index.shard_count, + shard_number: shard_index.shard_number, + }, + timeline_id, + }; + + if let Some(timeline_generation) = timeline_generations.get(&ttid) { + if &generation >= timeline_generation { + // Candidate orphan layer is in the current or future generation relative + // to the index we read for this timeline shard, so its absence from the index + // doesn't make it an orphan: more likely, it is a case where the layer was + // uploaded, but the index referencing the layer wasn't written yet. + continue; + } + } + + let orphan_path = remote_layer_path( + &tenant_id, + &timeline_id, + shard_index, + &layer_file, + generation, + ); + + tracing::info!("Orphan layer detected: {orphan_path}"); + + summary.notify_timeline_orphan(&ttid); + } + } + + // Iterate through all the timeline results. These are in key-order, so + // all results for the same tenant will be adjacent. 
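The scan loop that follows relies on the listing being key-ordered: results for one tenant are buffered until the tenant id changes, then analyzed in one pass, with a final flush once the stream ends. A minimal sketch of that accumulate-and-flush-on-key-change pattern (plain strings and tuples stand in for the real tenant/timeline types):

```rust
// "Analyze" one tenant once all of its timelines have been collected.
fn analyze_tenant(tenant: &str, timelines: &[(String, u32)]) {
    println!("tenant {tenant}: {} timeline shard(s)", timelines.len());
}

fn scan(results: impl IntoIterator<Item = (String, (String, u32))>) {
    let mut current: Option<String> = None;
    let mut buffered: Vec<(String, u32)> = Vec::new();

    for (tenant, timeline) in results {
        // When the tenant id changes, flush everything accumulated so far.
        if let Some(prev) = &current {
            if *prev != tenant {
                analyze_tenant(prev, &buffered);
                buffered.clear();
            }
        }
        current = Some(tenant);
        buffered.push(timeline);
    }

    // Don't forget the last tenant in the stream.
    if let Some(tenant) = current {
        analyze_tenant(&tenant, &buffered);
    }
}

fn main() {
    // The listing is key-ordered, so all entries of a tenant are adjacent.
    scan([
        ("tenant-a".to_string(), ("tl-1".to_string(), 3)),
        ("tenant-a".to_string(), ("tl-2".to_string(), 5)),
        ("tenant-b".to_string(), ("tl-1".to_string(), 2)),
    ]);
}
```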
We accumulate these, + // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); - let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)); + match tenant_id { + None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + Some(prev_tenant_id) => { + if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + let tenant_objects = std::mem::take(&mut tenant_objects); + let timelines = std::mem::take(&mut tenant_timeline_results); + analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + } + } + } - summary.update_analysis(&ttid, &analysis); + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } = &data.blob_data + { + tenant_objects.push(ttid, s3_layers.clone()); + } + tenant_timeline_results.push((ttid, data)); + } + + if !tenant_timeline_results.is_empty() { + analyze_tenant( + tenant_id.expect("Must be set if results are present"), + &mut summary, + tenant_objects, + tenant_timeline_results, + ); } Ok(summary) diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore index d9d4d0296a..cdf99aefd7 100644 --- a/scripts/sk_collect_dumps/.gitignore +++ b/scripts/sk_collect_dumps/.gitignore @@ -1,2 +1,4 @@ result *.json +hosts +poetry.lock diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg new file mode 100644 index 0000000000..150986ab79 --- /dev/null +++ b/scripts/sk_collect_dumps/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +host_key_checking = False +inventory=./hosts +remote_tmp=/tmp +remote_user=developer +callbacks_enabled = profile_tasks + +[ssh_connection] +scp_if_ssh = True +ssh_args = -F ./ssh.cfg +pipelining = True diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml new file mode 100644 index 0000000000..c6f6adafe2 --- /dev/null +++ b/scripts/sk_collect_dumps/pyproject.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "sk-collect-dumps" +version = "0.1.0" +description = "" +authors = ["Arseny Sher "] +readme = "README.md" +packages = [{include = "sk_collect_dumps"}] + +[tool.poetry.dependencies] +python = "^3.11" +ansible = "^9.1.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 52b73e9495..7494a6cb78 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -1,25 +1,43 @@ # Collect /v1/debug_dump from all safekeeper nodes -1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. - -## How to use ansible (staging) - +3. 
Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +# staging: +AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# prod: +AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# check +echo $AUTH_TOKEN +``` +2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +There are two ways to do that, with ssm or tsh. ssm: +``` +# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit): +AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml +``` +It will put the results to .results directory *near the playbook*. + +tsh: + +Update the inventory, if needed, selecting .build/.tech and optionally region: +``` +rm -f hosts && echo '[safekeeper]' >> hosts +# staging: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts +# prod: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts ``` -## How to use ansible (prod) - +Test ansible connection: ``` -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +ansible all -m ping -v ``` +Download the dumps: +``` +mkdir -p result && rm -f result/* +ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml +``` + +3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. 
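For reference, the "Dump file" task in the playbook boils down to a single authenticated HTTP GET per safekeeper. A rough stand-alone equivalent of that call (the port, path, and query string are taken from `remote.yaml`; `reqwest` is an assumption for illustration and not part of the repository's tooling):

```rust
// Cargo.toml (assumed): reqwest = { version = "0.11", features = ["blocking"] }
use std::{env, error::Error, fs};

fn main() -> Result<(), Box<dyn Error>> {
    // Same inputs the playbook uses: a safekeeper host and the admin-scoped JWT.
    let host = env::args().nth(1).expect("usage: dump <safekeeper-host>");
    let token = env::var("AUTH_TOKEN")?;

    let url = format!("http://{host}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false");
    let body = reqwest::blocking::Client::new()
        .get(url)
        .bearer_auth(token)
        .send()?
        .error_for_status()?
        .text()?;

    fs::create_dir_all("result")?;
    fs::write(format!("result/{host}-dump.json"), body)?;
    Ok(())
}
```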
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml index 29ce83efde..f214d0ae2c 100644 --- a/scripts/sk_collect_dumps/remote.yaml +++ b/scripts/sk_collect_dumps/remote.yaml @@ -1,18 +1,37 @@ - name: Fetch state dumps from safekeepers - hosts: safekeepers + hosts: safekeeper gather_facts: False - remote_user: "{{ remote_user }}" tasks: - - name: Download file + - name: Dump file get_url: url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" - dest: "/tmp/{{ inventory_hostname }}.json" + dest: "/tmp/{{ inventory_hostname }}-dump.json" + headers: + Authorization: "Bearer {{ auth_token }}" - - name: Fetch file from remote hosts + - name: install rsync + ansible.builtin.apt: + name: rsync + update_cache: yes + become: yes + ignore_errors: true # it can be already installed and we don't always have sudo + + - name: Fetch file from remote hosts (works only with ssm) fetch: - src: "/tmp/{{ inventory_hostname }}.json" - dest: "./result/{{ inventory_hostname }}.json" + src: "/tmp/{{ inventory_hostname }}-dump.json" + dest: "./result/{{ inventory_hostname }}-dump.json" flat: yes fail_on_missing: no + when: ansible_connection == "aws_ssm" + # xxx not sure how to make ansible 'synchronize' work with tsh + - name: Fetch file from remote hosts + shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json" + delegate_to: localhost + when: ansible_connection != "aws_ssm" + + - name: remove remote dumps + ansible.builtin.file: + path: "/tmp/{{ inventory_hostname }}-dump.json" + state: absent diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg new file mode 100644 index 0000000000..827c5d9286 --- /dev/null +++ b/scripts/sk_collect_dumps/ssh.cfg @@ -0,0 +1,13 @@ +# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh + +# Common flags for all teleport.aws.neon.tech hosts +Host * + HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com + +# Flags for all teleport.aws.neon.tech hosts except the proxy +Host * !teleport.aws.neon.tech + Port 3022 + ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p + User developer + +# End generated Teleport configuration \ No newline at end of file diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh index 2e54ecba1c..5189883fcb 100755 --- a/scripts/sk_collect_dumps/upload.sh +++ b/scripts/sk_collect_dumps/upload.sh @@ -31,22 +31,22 @@ SELECT (data->>'tenant_id') AS tenant_id, (data->>'timeline_id') AS timeline_id, (data->'memory'->>'active')::bool AS active, - (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, - (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, - (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, - (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, - (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, - (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn, + 
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn, (data->'memory'->>'num_computes')::bigint AS num_computes, - (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn, (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, (data->'memory'->>'is_cancelled')::bool AS is_cancelled, - (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, - (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn, (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, - (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, - (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, - (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, - (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn + (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn FROM tmp_json EOF diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index a0c8e1f749..d66cbefa45 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -3,9 +3,12 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use clap::Parser; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; + +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; @@ -91,15 +94,23 @@ async fn subscribe(client: Option, counter: Arc, None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + let ttid = ProtoTenantTimelineId { tenant_id: vec![0xFF; 16], timeline_id: tli_from_u64(i), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), }; - let mut stream = client - .subscribe_safekeeper_info(request) + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo.into(), + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ttid), + }), + }; + + let mut stream: tonic::Streaming = client + .subscribe_by_filter(request) .await .unwrap() .into_inner(); diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index aa9d62a29f..7d1b63d23f 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -10,6 +10,12 @@ service BrokerService { // Publish safekeeper updates. rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {}; + + // Subscribe to all messages, limited by a filter. 
+ rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {}; + + // Publish one message. + rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {}; } message SubscribeSafekeeperInfoRequest { @@ -48,3 +54,55 @@ message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; } + +message FilterTenantTimelineId { + // If true, only messages related to `tenant_timeline_id` will be emitted. + // Otherwise, messages for all timelines will be emitted. + bool enabled = 1; + TenantTimelineId tenant_timeline_id = 2; +} + +message TypeSubscription { + MessageType type = 1; +} + +message SubscribeByFilterRequest { + // Subscription will emit messages only of the specified types. You need to specify + // at least one type to receive any messages. + repeated TypeSubscription types = 1; + + // If set and enabled, subscription will emit messages only for the specified tenant/timeline. + optional FilterTenantTimelineId tenant_timeline_id = 2; +} + +enum MessageType { + UNKNOWN = 0; + SAFEKEEPER_TIMELINE_INFO = 2; + SAFEKEEPER_DISCOVERY_REQUEST = 3; + SAFEKEEPER_DISCOVERY_RESPONSE = 4; +} + +// A message with a type. +message TypedMessage { + MessageType type = 1; + + optional SafekeeperTimelineInfo safekeeper_timeline_info = 2; + optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3; + optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4; +} + +message SafekeeperDiscoveryRequest { + TenantTimelineId tenant_timeline_id = 1; +} + +// Shorter version of SafekeeperTimelineInfo, contains only necessary fields. +message SafekeeperDiscoveryResponse { + uint64 safekeeper_id = 1; + TenantTimelineId tenant_timeline_id = 2; + // WAL available to download. + uint64 commit_lsn = 3; + // A connection string to use for WAL downloading. + string safekeeper_connstr = 4; + // Availability zone of a safekeeper. + optional string availability_zone = 5; +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9f81ac6cac..4e5f8ed724 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -35,10 +35,16 @@ use tracing::*; use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; -use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; +use storage_broker::metrics::{ + BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, +}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, +}; use storage_broker::{ parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, }; @@ -73,8 +79,103 @@ struct Args { log_format: String, } -type PubId = u64; // id of publisher for registering in maps -type SubId = u64; // id of subscriber for registering in maps +/// Id of publisher for registering in maps +type PubId = u64; + +/// Id of subscriber for registering in maps +type SubId = u64; + +/// Single enum type for all messages. 
+#[derive(Clone, Debug, PartialEq)] +#[allow(clippy::enum_variant_names)] +enum Message { + SafekeeperTimelineInfo(SafekeeperTimelineInfo), + SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest), + SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse), +} + +impl Message { + /// Convert proto message to internal message. + pub fn from(proto_msg: TypedMessage) -> Result { + match proto_msg.r#type() { + MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( + proto_msg.safekeeper_timeline_info.ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info") + })?, + )), + MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest( + proto_msg.safekeeper_discovery_request.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_request", + ) + })?, + )), + MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse( + proto_msg.safekeeper_discovery_response.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_response", + ) + })?, + )), + MessageType::Unknown => Err(Status::new( + Code::InvalidArgument, + format!("invalid message type: {:?}", proto_msg.r#type), + )), + } + } + + /// Get the tenant_timeline_id from the message. + pub fn tenant_timeline_id(&self) -> Result, Status> { + match self { + Message::SafekeeperTimelineInfo(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryRequest(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryResponse(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + } + } + + /// Convert internal message to the protobuf struct. + pub fn as_typed_message(&self) -> TypedMessage { + let mut res = TypedMessage { + r#type: self.message_type() as i32, + ..Default::default() + }; + match self { + Message::SafekeeperTimelineInfo(msg) => { + res.safekeeper_timeline_info = Some(msg.clone()) + } + Message::SafekeeperDiscoveryRequest(msg) => { + res.safekeeper_discovery_request = Some(msg.clone()) + } + Message::SafekeeperDiscoveryResponse(msg) => { + res.safekeeper_discovery_response = Some(msg.clone()) + } + } + res + } + + /// Get the message type. + pub fn message_type(&self) -> MessageType { + match self { + Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo, + Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest, + Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse, + } + } +} #[derive(Copy, Clone, Debug)] enum SubscriptionKey { @@ -83,7 +184,7 @@ enum SubscriptionKey { } impl SubscriptionKey { - // Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). + /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). 
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -92,14 +193,29 @@ impl SubscriptionKey { } } } + + /// Parse from FilterTenantTimelineId + pub fn from_proto_filter_tenant_timeline_id( + f: &FilterTenantTimelineId, + ) -> Result { + if !f.enabled { + return Ok(SubscriptionKey::All); + } + + let ttid = + parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing tenant_timeline_id") + })?)?; + Ok(SubscriptionKey::Timeline(ttid)) + } } -// Channel to timeline subscribers. +/// Channel to timeline subscribers. struct ChanToTimelineSub { - chan: broadcast::Sender, - // Tracked separately to know when delete the shmem entry. receiver_count() - // is unhandy for that as unregistering and dropping the receiver side - // happens at different moments. + chan: broadcast::Sender, + /// Tracked separately to know when delete the shmem entry. receiver_count() + /// is unhandy for that as unregistering and dropping the receiver side + /// happens at different moments. num_subscribers: u64, } @@ -110,7 +226,7 @@ struct SharedState { num_subs_to_timelines: i64, chans_to_timeline_subs: HashMap, num_subs_to_all: i64, - chan_to_all_subs: broadcast::Sender, + chan_to_all_subs: broadcast::Sender, } impl SharedState { @@ -146,7 +262,7 @@ impl SharedState { &mut self, sub_key: SubscriptionKey, timeline_chan_size: usize, - ) -> (SubId, broadcast::Receiver) { + ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; let sub_rx = match sub_key { @@ -262,6 +378,29 @@ impl Registry { subscriber.id, subscriber.key, subscriber.remote_addr ); } + + /// Send msg to relevant subscribers. + pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { + PROCESSED_MESSAGES_TOTAL.inc(); + + // send message to subscribers for everything + let shared_state = self.shared_state.read(); + // Err means there is no subscribers, it is fine. + shared_state.chan_to_all_subs.send(msg.clone()).ok(); + + // send message to per timeline subscribers, if there is ttid + let ttid = msg.tenant_timeline_id()?; + if let Some(ttid) = ttid { + if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { + // Err can't happen here, as tx is destroyed only after removing + // from the map the last subscriber along with tx. + subs.chan + .send(msg.clone()) + .expect("rx is still in the map with zero subscribers"); + } + } + Ok(()) + } } // Private subscriber state. @@ -269,7 +408,7 @@ struct Subscriber { id: SubId, key: SubscriptionKey, // Subscriber receives messages from publishers here. - sub_rx: broadcast::Receiver, + sub_rx: broadcast::Receiver, // to unregister itself from shared state in Drop registry: Registry, // for logging @@ -291,26 +430,9 @@ struct Publisher { } impl Publisher { - // Send msg to relevant subscribers. - pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> { - // send message to subscribers for everything - let shared_state = self.registry.shared_state.read(); - // Err means there is no subscribers, it is fine. 
- shared_state.chan_to_all_subs.send(msg.clone()).ok(); - - // send message to per timeline subscribers - let ttid = - parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| { - Status::new(Code::InvalidArgument, "missing tenant_timeline_id") - })?)?; - if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { - // Err can't happen here, as tx is destroyed only after removing - // from the map the last subscriber along with tx. - subs.chan - .send(msg.clone()) - .expect("rx is still in the map with zero subscribers"); - } - Ok(()) + /// Send msg to relevant subscribers. + pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { + self.registry.send_msg(msg) } } @@ -339,7 +461,7 @@ impl BrokerService for Broker { loop { match stream.next().await { - Some(Ok(msg)) => publisher.send_msg(&msg)?, + Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?, Some(Err(e)) => return Err(e), // grpc error from the stream None => break, // closed stream } @@ -371,8 +493,15 @@ impl BrokerService for Broker { let mut missed_msgs: u64 = 0; loop { match subscriber.sub_rx.recv().await { - Ok(info) => yield info, + Ok(info) => { + match info { + Message::SafekeeperTimelineInfo(info) => yield info, + _ => {}, + } + BROADCASTED_MESSAGES_TOTAL.inc(); + }, Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); missed_msgs += skipped_msg; if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", @@ -392,6 +521,78 @@ impl BrokerService for Broker { Box::pin(output) as Self::SubscribeSafekeeperInfoStream )) } + + type SubscribeByFilterStream = + Pin> + Send + 'static>>; + + /// Subscribe to all messages, limited by a filter. + async fn subscribe_by_filter( + &self, + request: Request, + ) -> std::result::Result, Status> { + let remote_addr = request + .remote_addr() + .expect("TCPConnectInfo inserted by handler"); + let proto_filter = request.into_inner(); + let ttid_filter = proto_filter + .tenant_timeline_id + .as_ref() + .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + + let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; + let types_set = proto_filter + .types + .iter() + .map(|t| t.r#type) + .collect::>(); + + let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr); + + // transform rx into stream with item = Result, as method result demands + let output = async_stream::try_stream! 
{ + let mut warn_interval = time::interval(Duration::from_millis(1000)); + let mut missed_msgs: u64 = 0; + loop { + match subscriber.sub_rx.recv().await { + Ok(msg) => { + let msg_type = msg.message_type() as i32; + if types_set.contains(&msg_type) { + yield msg.as_typed_message(); + BROADCASTED_MESSAGES_TOTAL.inc(); + } + }, + Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); + missed_msgs += skipped_msg; + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + missed_msgs = 0; + } + } + Err(RecvError::Closed) => { + // can't happen, we never drop the channel while there is a subscriber + Err(Status::new(Code::Internal, "channel unexpectantly closed"))?; + } + } + } + }; + + Ok(Response::new( + Box::pin(output) as Self::SubscribeByFilterStream + )) + } + + /// Publish one message. + async fn publish_one( + &self, + request: Request, + ) -> std::result::Result, Status> { + let msg = Message::from(request.into_inner())?; + PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc(); + self.registry.send_msg(&msg)?; + Ok(Response::new(())) + } } // We serve only metrics and healthcheck through http1. @@ -515,8 +716,8 @@ mod tests { use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; - fn msg(timeline_id: Vec) -> SafekeeperTimelineInfo { - SafekeeperTimelineInfo { + fn msg(timeline_id: Vec) -> Message { + Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, tenant_timeline_id: Some(ProtoTenantTimelineId { tenant_id: vec![0x00; 16], @@ -533,7 +734,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, - } + }) } fn tli_from_u64(i: u64) -> Vec { diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index f0649d0f68..1fd3dd5ad6 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. 
-use metrics::{register_int_gauge, IntGauge}; +use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { @@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy = Lazy::new(|| { ) .expect("Failed to register metric") }); + +pub static PROCESSED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_processed_messages_total", + "Number of messages received by storage broker, before routing and broadcasting" + ) + .expect("Failed to register metric") +}); + +pub static BROADCASTED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcasted_messages_total", + "Number of messages broadcasted (sent over network) to subscribers" + ) + .expect("Failed to register metric") +}); + +pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcast_dropped_messages_total", + "Number of messages dropped due to channel capacity overflow" + ) + .expect("Failed to register metric") +}); + +pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_published_oneoff_messages_total", + "Number of one-off messages sent via PublishOne method" + ) + .expect("Failed to register metric") +}); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 42e122cefe..597e311e02 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -365,6 +365,12 @@ class PgProtocol: result.append(cur.fetchall()) return result + def safe_psql_scalar(self, query) -> Any: + """ + Execute query returning single row with single column. + """ + return self.safe_psql(query)[0][0] + @dataclass class AuthKeys: @@ -457,7 +463,6 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.enable_generations = True self.scrub_on_exit = False self.test_output_dir = test_output_dir @@ -677,8 +682,7 @@ class NeonEnvBuilder: pageserver.stop(immediate=True) - if self.env.attachment_service is not None: - self.env.attachment_service.stop(immediate=True) + self.env.attachment_service.stop(immediate=True) cleanup_error = None @@ -772,13 +776,9 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - if config.enable_generations: - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) - else: - self.control_plane_api = None - self.attachment_service = None + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: NeonAttachmentService = NeonAttachmentService(self) # Create a config file corresponding to the options cfg: Dict[str, Any] = { @@ -851,8 +851,7 @@ class NeonEnv: # Start up broker, pageserver and all safekeepers self.broker.try_start() - if self.attachment_service is not None: - self.attachment_service.start() + self.attachment_service.start() for pageserver in self.pageservers: pageserver.start() @@ -1834,20 +1833,19 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() return client.tenant_attach( - tenant_id, config, 
config_null, generation=self.maybe_get_generation(tenant_id) + tenant_id, + config, + config_null, + generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), ) def tenant_detach(self, tenant_id: TenantId): - if self.env.attachment_service is not None: - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.attachment_service.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): - # This API is only for use when generations are enabled - assert self.env.attachment_service is not None - if config["mode"].startswith("Attached") and "generation" not in config: config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) @@ -1873,26 +1871,15 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.maybe_get_generation(tenant_id) + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() - return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id)) - - def maybe_get_generation(self, tenant_id: TenantId): - """ - For tests that would like to use an HTTP client directly instead of using - the `tenant_attach` and `tenant_create` helpers here: issue a generation - number for a tenant. - - Returns None if the attachment service is not enabled (legacy mode) - """ - if self.env.attachment_service is not None: - return self.env.attachment_service.attach_hook_issue(tenant_id, self.id) - else: - return None + return client.tenant_load( + tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + ) def append_pageserver_param_overrides( @@ -2752,6 +2739,13 @@ class Endpoint(PgProtocol): ): self.stop() + # Checkpoints running endpoint and returns pg_wal size in MB. + def get_pg_wal_size(self): + log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + self.safe_psql("checkpoint") + assert self.pgdata_dir is not None # please mypy + return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -2950,6 +2944,13 @@ class Safekeeper: return segments +# Walreceiver as returned by sk's timeline status endpoint. 
+@dataclass +class Walreceiver: + conn_id: int + state: str + + @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int @@ -2960,6 +2961,7 @@ class SafekeeperTimelineStatus: backup_lsn: Lsn peer_horizon_lsn: Lsn remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] @dataclass @@ -3021,6 +3023,7 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -3030,6 +3033,7 @@ class SafekeeperHttpClient(requests.Session): backup_lsn=Lsn(resj["backup_lsn"]), peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, ) def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eda8813c36..add6c4288a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -510,13 +510,21 @@ class PageserverHttpClient(requests.Session): assert res_json is None def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int + self, + tenant_id: TenantId, + timeline_id: TimelineId, + timestamp, + version: Optional[int] = None, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) + if version is None: + version_str = "" + else: + version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d95368f990..ea648e460d 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -125,3 +125,51 @@ class TenantId(Id): class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + + +# Workaround for compat with python 3.9, which does not have `typing.Self` +TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") + + +class TenantShardId: + def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): + self.tenant_id = tenant_id + self.shard_number = shard_number + self.shard_count = shard_count + assert self.shard_number < self.shard_count or self.shard_count == 0 + + @classmethod + def parse(cls: Type[TTenantShardId], input) -> TTenantShardId: + if len(input) == 32: + return cls( + tenant_id=TenantId(input), + shard_number=0, + shard_count=0, + ) + elif len(input) == 37: + return cls( + tenant_id=TenantId(input[0:32]), + shard_number=int(input[33:35], 16), + shard_count=int(input[35:37], 16), + ) + else: + raise ValueError(f"Invalid TenantShardId '{input}'") + + def __str__(self): + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + + def _tuple(self) -> tuple[TenantId, int, int]: + return (self.tenant_id, self.shard_number, self.shard_count) + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return 
self._tuple() < other._tuple() + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() == other._tuple() + + def __hash__(self) -> int: + return hash(self._tuple()) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index a2a1fa11e5..edc23b29ba 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -61,7 +61,6 @@ def measure_recovery_time(env: NeonCompare): # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. - assert env.env.attachment_service is not None attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 0f7615f7ed..1e6e9a0174 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -17,6 +17,27 @@ class LabelledQuery: query: str +# This must run before all tests in this module +# create extension pg_stat_statements if it does not exist +# and TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +# Theoretically this could be in a module or session scope fixture, +# however the code depends on other fixtures that have function scope +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Creating extension pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Creating extension pg_stat_statements") + query = LabelledQuery( + "Q_CREATE_EXTENSION", r"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + ) + run_psql(remote_compare, query, times=1, explain=False) + log.info("Reset pg_stat_statements") + query = LabelledQuery("Q_RESET", r"SELECT pg_stat_statements_reset();") + run_psql(remote_compare, query, times=1, explain=False) + + # A list of queries to run. # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports @@ -78,6 +99,8 @@ QUERIES: Tuple[LabelledQuery, ...] 
= (
    # fmt: on
)

+EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
+

def get_scale() -> List[str]:
    # We parametrize each tpc-h and clickbench test with scale
@@ -88,7 +111,10 @@ def get_scale() -> List[str]:
    return [scale]


-def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
+# run the query `times` times, plus once with EXPLAIN (ANALYZE, VERBOSE, ...) if explain is requested
+def run_psql(
+    env: RemoteCompare, labelled_query: LabelledQuery, times: int, explain: bool = False
+) -> None:
    # prepare connstr:
    # - cut out password from connstr to pass it via env
    # - add options to connstr
@@ -108,6 +134,13 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N
            log.info(f"Run {run}/{times}")
            with env.zenbenchmark.record_duration(f"{label}/{run}"):
                env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
+    if explain:
+        log.info(f"Explaining query {label}")
+        run += 1
+        with env.zenbenchmark.record_duration(f"{label}/EXPLAIN"):
+            env.pg_bin.run_capture(
+                ["psql", connstr, "-c", f"{EXPLAIN_STRING} {query}"], env=environ
+            )


@pytest.mark.parametrize("scale", get_scale())
@@ -118,10 +151,13 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
    An OLAP-style ClickHouse benchmark

    Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
-    The DB prepared manually in advance
+    The DB is prepared manually in advance.
+    Important: after the initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;`
+    to ensure that the Postgres optimizer chooses the same plans as RDS and Aurora.
    """
+    explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"

-    run_psql(remote_compare, query, times=3)
+    run_psql(remote_compare, query, times=3, explain=explain)


def tpch_queuies() -> Tuple[ParameterSet, ...]:
@@ -195,3 +231,16 @@ def test_user_examples(remote_compare: RemoteCompare):
        """,
    )
    run_psql(remote_compare, query, times=3)
+
+
+# This must run after all tests in this module
+# Collect pg_stat_statements after running the tests if TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false)
+@pytest.mark.skipif(
+    os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false",
+    reason="Skipping - Collecting pg_stat_statements",
+)
+@pytest.mark.remote_cluster
+def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
+    log.info("Collecting pg_stat_statements")
+    query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
+    run_psql(remote_compare, query, times=1, explain=False)
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 352ec13884..32397bbcc1 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -136,10 +136,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
    ps_http.tenant_detach(tenant_id)
    assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]

-    body = {}
-    gen = env.pageserver.maybe_get_generation(tenant_id)
-    if gen is not None:
-        body["generation"] = gen
+    body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}

    ps_http.post(
        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index 1b6c982850..adb67a579e
100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -87,7 +87,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off attachment service to treat the one we're # about to start as the attached pageserver - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index f3f3a1ddf3..9fdc4d59f5 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,6 +1,7 @@ +import enum import time from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import pytest import toml @@ -64,6 +65,23 @@ def test_min_resident_size_override_handling( assert_config(tenant_id, None, config_level_override) +@enum.unique +class EvictionOrder(str, enum.Enum): + ABSOLUTE_ORDER = "absolute" + RELATIVE_ORDER_EQUAL = "relative_equal" + RELATIVE_ORDER_SPARE = "relative_spare" + + def config(self) -> Dict[str, Any]: + if self == EvictionOrder.ABSOLUTE_ORDER: + return {"type": "AbsoluteAccessed"} + elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + elif self == EvictionOrder.RELATIVE_ORDER_SPARE: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + else: + raise RuntimeError(f"not implemented: {self}") + + @dataclass class EvictionEnv: timelines: list[Tuple[TenantId, TimelineId]] @@ -108,13 +126,14 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior + self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder ): disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, "min_avail_bytes": min_avail_bytes, "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), } enc = toml.TomlEncoder() @@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_evicts_until_pressure_is_relieved( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Basic test to ensure that we evict enough to relieve pressure. 
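    The order parameter is one of the EvictionOrder variants defined above; its
    config() payload is passed through to the pageserver as-is. For reference, a
    sketch of the request this test issues (mirroring the call in the body below):

        response = pageserver_http.disk_usage_eviction_run(
            {"evict_bytes": target, "eviction_order": order.config()}
        )
        # e.g. EvictionOrder.RELATIVE_ORDER_EQUAL.config() ==
        # {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}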
""" @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. """ @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. 
# So, set target to all occupied space, except 2*env.layer_size per tenant target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): ), "all tenants should have lost some layers" warm_size = later_du_by_timeline[warm] - - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] - - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size - cold_size = later_du_by_timeline[cold] - cold_upper = 2 * env.layer_size - log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" - ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + if order == EvictionOrder.ABSOLUTE_ORDER: + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + cold_upper = 2 * env.layer_size + log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + + assert ( + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + else: + # just go with the space was freed, find proper limits later + pass def poor_mans_du( @@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") @@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): @@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
"name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ef2b2185c3..340188c1ae 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -157,7 +157,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - assert env.attachment_service is not None attachment = env.attachment_service.inspect(tenant_id) assert attachment is not None return attachment[0] diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f79c1c347c..65d6d7a9fd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -8,71 +8,6 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") - - cur = endpoint_main.connect().cursor() - # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. - # - # Each row contains current insert LSN and the current timestamp, when - # the row was inserted. - cur.execute("SET synchronous_commit=off") - cur.execute("CREATE TABLE foo (x integer)") - tbl = [] - for i in range(1000): - cur.execute("INSERT INTO foo VALUES(%s)", (i,)) - # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) - tbl.append([i, after_timestamp]) - - # Execute one more transaction with synchronous_commit enabled, to flush - # all the previous transactions - cur.execute("SET synchronous_commit=on") - cur.execute("INSERT INTO foo VALUES (-1)") - - # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) - - with env.pageserver.http_client() as client: - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - lsn = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - # Call get_lsn_by_timestamp to get the LSN - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. 
- endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn - ) - assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i - - endpoint_here.stop_and_destroy() - - # # Test pageserver get_lsn_by_timestamp API # @@ -130,7 +65,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "future" # make sure that we return a well advanced lsn here @@ -139,7 +74,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range @@ -149,7 +84,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 64e41a2dd5..573d2139ce 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -72,7 +72,9 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id)) + client.tenant_create( + tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4488be31c5..9c2f5786d4 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -187,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - After upgrade, the bucket should contain a mixture. - In both cases, postgres I/O should work. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -196,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - assert env.attachment_service is not None env.attachment_service.start() env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) @@ -262,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None some_other_pageserver = 1234 ps_http = env.pageserver.http_client() @@ -341,7 +337,6 @@ def test_deletion_queue_recovery( :param validate_before: whether to wait for deletions to be validated before restart. This makes them elegible to be executed after restart, if the same node keeps the attachment. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -405,7 +400,6 @@ def test_deletion_queue_recovery( if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) env.pageserver.start() @@ -453,7 +447,6 @@ def test_deletion_queue_recovery( def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -473,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - assert env.attachment_service is not None env.attachment_service.stop() # Remember how many validations had happened before the control plane went offline @@ -545,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): and must be constructed using the proper generation for the layer, which may not be the same generation that the tenant is running in. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -575,7 +566,6 @@ def test_multi_attach( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 3cac32b790..c4499196b5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -9,9 +9,7 @@ from fixtures.utils import wait_until # Test restarting page server, while safekeeper and compute node keep # running. 
-@pytest.mark.parametrize("generations", [True, False]) -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): - neon_env_builder.enable_generations = generations +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 64ade346aa..8ae4297983 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -57,13 +57,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): states are valid, so that we may test it in this way: the API should always work as long as the tenant exists. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None pageservers = env.pageservers list([p.http_client() for p in pageservers]) @@ -210,13 +208,11 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): """ Test the sequence of location states that are used in a live migration. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None tenant_id = env.initial_tenant timeline_id = env.initial_timeline diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 3004d69f50..2fda56d0f4 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -60,8 +60,6 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - neon_env_builder.enable_generations = generations - # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. 
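The changes above drop the `enable_generations` switch and the `attachment_service is not None` asserts because the attachment service is now always part of the test environment. A minimal sketch of the generation workflow the tests rely on, assuming only the `attach_hook_issue`/`inspect`/`tenant_create` calls that appear elsewhere in this patch:

tenant_id = TenantId.generate()
# the attachment service issues the generation a pageserver should attach in
generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
env.pageserver.http_client().tenant_create(tenant_id, generation=generation)
# later, recover the generation the tenant was last attached in
attach_status = env.attachment_service.inspect(tenant_id=tenant_id)
assert attach_status is not None
(attach_gen, _) = attach_status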
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index c6d578a7a2..82ffcb1177 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http, env.initial_tenant, timeline_id, iterations=iterations ) - if failpoint == "timeline-delete-after-index-delete": - m = ps_http.get_metrics() - assert ( - m.query_one( - "remote_storage_s3_request_seconds_count", - filter={"request_type": "get_object", "result": "ok"}, - ).value - == 1 # index part for initial timeline - ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6e510b2eba..11685d1d48 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,3 +1,4 @@ +import concurrent.futures import math import queue import random @@ -24,6 +25,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, + wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def get_tenant_states(): states = {} + log.info(f"Tenant ids: {tenant_ids}") for tenant_id in tenant_ids: tenant = pageserver_http.tenant_status(tenant_id=tenant_id) states[tenant_id] = tenant["state"]["slug"] @@ -872,3 +875,51 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + + # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # body of the test because it will disrupt tenant counts + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + wait_until(10, 1, at_least_one_active) + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + # Deleting a stuck tenant should prompt it to go active + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def delete_tenant(): + env.pageserver.http_client().tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. 
So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + + def activated_on_demand(): + assert env.pageserver.log_contains(log_match) is not None + + log.info(f"Waiting for activation message '{log_match}'") + try: + wait_until(10, 1, activated_on_demand) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked + log.info("Joining background delete") + background_delete.result(timeout=10) + + # Poll for deletion to complete + wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 1 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3c40a9cb3e..cf8df389c8 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None): try: if f(): break - except Exception: + except Exception as e: + log.info(f"got exception while waiting for {desc}: {e}") pass elapsed = time.time() - started_at if elapsed > timeout: @@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Context manager which logs passed time on exit. +class DurationLogger: + def __init__(self, desc): + self.desc = desc + + def __enter__(self): + self.ts_before = time.time() + + def __exit__(self, *exc): + log.info(f"{self.desc} finished in {time.time() - self.ts_before}s") + + +# Context manager which logs WAL position change on exit. +class WalChangeLogger: + def __init__(self, ep, desc_before): + self.ep = ep + self.desc_before = desc_before + + def __enter__(self): + self.ts_before = time.time() + self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info(f"{self.desc_before}, lsn_before={self.lsn_before}") + + def __exit__(self, *exc): + lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info( + f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s" + ) + + # Test that we can create timeline with one safekeeper down and initialize it -# later when some data already had been written. +# later when some data already had been written. It is strictly weaker than +# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute +# download (recovery) and as such useful for development/testing. def test_late_init(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1 = env.safekeepers[0] sk1.stop() - # create and insert smth while safekeeper is down... - env.neon_cli.create_branch("test_late_init") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") + # create and insert smth while safekeeper is down... 
endpoint.safe_psql("create table t(key int, value text)") - endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") - log.info("insert with safekeeper down done") + with WalChangeLogger(endpoint, "doing insert with sk1 down"): + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") endpoint.stop() # stop compute # stop another safekeeper, and start one which missed timeline creation @@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.start() # insert some more - endpoint = env.endpoints.create_start("test_late_init") + with DurationLogger("recovery"): + endpoint = env.endpoints.create_start("test_late_init") endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") + wait_flush_lsn_align_by_ep( + env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]] + ) + # Check that WALs are the same. + cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id) + # is timeline flush_lsn equal on provided safekeepers? -def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id): - status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id) - status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id) - log.info( - f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}" +def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + flush_lsns = [ + sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn + for sk_http_cli in sk_http_clis + ] + log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}") + return all([flush_lsns[0] == flsn for flsn in flush_lsns]) + + +def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 + + +# Assert by xxd that WAL on given safekeepers is identical. No compute must be +# running for this to be reliable. +def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed" + sk_http_clis = [sk.http_client() for sk in sks] + + # First check that term / flush_lsn are the same: it is easier to + # report/understand if WALs are different due to that. + statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] + term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + assert ( + term_flush_lsns[0] == tfl + ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + + # check that WALs are identic. 
+    segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
+    for cmp_segs, sk in zip(segs[1:], sks[1:]):
+        assert (
+            segs[0] == cmp_segs
+        ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identical: {segs[0]} and {cmp_segs}"
+    log.info(f"comparing segs {segs[0]}")
+
+    sk0 = sks[0]
+    for sk in sks[1:]:
+        (_, mismatch, not_regular) = filecmp.cmpfiles(
+            sk0.timeline_dir(tenant_id, timeline_id),
+            sk.timeline_dir(tenant_id, timeline_id),
+            segs[0],
+            shallow=False,
+        )
+        log.info(
+            f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+        )
+
+        for f in mismatch:
+            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
+            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
+            stdout_filename = "{}.filediff".format(f2)
+
+            with open(stdout_filename, "w") as stdout_f:
+                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+                cmd = "diff {}.hex {}.hex".format(f1, f2)
+                subprocess.run([cmd], stdout=stdout_f, shell=True)
+
+        assert (mismatch, not_regular) == (
+            [],
+            [],
+        ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identical"
+
+
+# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is
+# running. ep is stopped by this function. This is used in tests which check
+# binary equality of WAL segments on safekeepers, which is inherently racy as
+# shutting down the endpoint might write some WAL which can get to only one
+# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if
+# it has changed.
+def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
+    sk_http_clis = [sk.http_client() for sk in sks]
+    # First wait for the alignment.
+    wait(
+        partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
+        "flush_lsn to get aligned",
    )
-    return status1.flush_lsn == status2.flush_lsn
+    ep.stop()  # then stop endpoint
+    # Even if there is no compute, there might be some in flight data; ensure
+    # all walreceivers die before rechecking.
+    for sk_http_cli in sk_http_clis:
+        wait(
+            partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
+            "walreceivers to be gone",
+        )
+    # Now recheck flush_lsn again and exit if it is good
+    if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
+        return
+    # Otherwise repeat.
+    log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
+    ep = env.endpoints.create_start(branch)


-# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
-# 1) walproposer can't recover node if it misses WAL written by previous computes, but
-# still starts up and functions normally if two other sks are ok.
-# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
-# normally if two other sks are ok.
-# 3) Lagged safekeeper can still recover by peer recovery.
-def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
-    pass
+# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
+# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
+# compute doesn't keep much WAL for a lagging sk, but still can recover it with
+# neon_walreader, in two scenarios: a) WAL never existed on compute (it started
+# on basebackup LSN later than lagging sk position) though segment file exists
+# b) WAL had been recycled on it and segment file doesn't exist.
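+# (Sizing note for the pg_wal assertions below: WAL segments are 16 MB by default,
+# and the endpoints whose pg_wal size is checked run with min_wal_size = max_wal_size
+# = 32 MB, i.e. two segments, and wal_keep_size = 0, so pg_wal is expected to stay
+# around two segments plus a partial one, hence the bound
+#     assert ep.get_pg_wal_size() < 16 * 2.5  # ~40 MB
+# used after each batch of inserts.)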
+# +# Also checks along the way that whenever there are two sks alive, compute +# should be able to commit. +def test_lagging_sk(neon_env_builder: NeonEnvBuilder): + # inserts ~20MB of WAL, a bit more than a segment. + def fill_segment(ep): + ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + (sk1, sk2, sk3) = env.safekeepers + + # create and insert smth while safekeeper is down... + sk1.stop() + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_lagging_sk") + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("create table t(key int, value text)") + # make small insert to be on the same segment + ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'") + log.info("insert with safekeeper down done") + ep.stop() # stop compute + + # Stop another safekeeper, and start one which missed timeline creation. + sk2.stop() + sk1.start() + + # Start new ep and insert some more. neon_walreader should download WAL for + # sk1 because it should be filled since the horizon (initial LSN) which is + # earlier than basebackup LSN. + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now repeat insertion with sk1 down, but with inserting more data to check + # that WAL on compute is removed. + sk1.stop() + sk2.start() + + # min_wal_size must be at least 2x segment size. + min_wal_config = [ + "min_wal_size=32MB", + "max_wal_size=32MB", + "wal_keep_size=0", + "log_checkpoints=on", + ] + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + with WalChangeLogger(ep, "doing large insert with sk1 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + sk2.stop() # stop another sk to ensure sk1 and sk3 can work + sk1.start() + with DurationLogger("recovery"): + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now do the same with different safekeeper sk2 down, and restarting ep + # before recovery (again scenario when recovery starts below basebackup_lsn, + # but multi segment now). + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + with WalChangeLogger(ep, "doing large insert with sk2 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + ep.stop() + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + sk2.start() + with DurationLogger("recovery"): + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3]) + # Check that WALs are the same. 
+ cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) # Smaller version of test_one_sk_down testing peer recovery in isolation: that @@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): sk2_http_cli = sk2.http_client() # ensure tli gets created on sk1, peer recovery won't do that wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) @@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024 # wait a bit, lsns shouldn't change - # time.sleep(5) + time.sleep(2) sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id) sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id) log.info( @@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # now restart safekeeper with peer recovery enabled and wait for recovery sk1.stop().start(extra_opts=["--peer-recovery=true"]) wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) - # check that WALs are identic after recovery - segs = sk1.list_segments(tenant_id, timeline_id) - log.info(f"segs are {segs}") - - (_, mismatch, not_regular) = filecmp.cmpfiles( - sk1.timeline_dir(tenant_id, timeline_id), - sk2.timeline_dir(tenant_id, timeline_id), - segs, - shallow=False, - ) - log.info( - f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" - ) - - for f in mismatch: - f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) - - with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) - - cmd = "diff {}.hex {}.hex".format(f1, f2) - subprocess.run([cmd], stdout=stdout_f, shell=True) - - assert (mismatch, not_regular) == ([], []) + cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() @@ -1364,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted -# to all safekeepers. This test checks that compute WAL can fit into small number -# of WAL segments. 
-def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): - # used to calculate delta in collect_stats - last_lsn = Lsn(0) - - # returns pg_wal size in MB - def collect_stats(endpoint: Endpoint, cur, enable_logs=True): - nonlocal last_lsn - assert endpoint.pgdata_dir is not None - - log.info("executing INSERT to generate WAL") - current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 - if enable_logs: - lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 - log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") - last_lsn = current_lsn - return pg_wal_size_mb - - # generates about ~20MB of WAL, to create at least one new segment - def generate_wal(cur): - cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch("test_wal_deleted_after_broadcast") - # Adjust checkpoint config to prevent keeping old WAL segments - endpoint = env.endpoints.create_start( - "test_wal_deleted_after_broadcast", - config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - cur.execute("CREATE TABLE t(key int, value text)") - - collect_stats(endpoint, cur) - - # generate WAL to simulate normal workload - for _ in range(5): - generate_wal(cur) - collect_stats(endpoint, cur) - - log.info("executing checkpoint") - cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(endpoint, cur) - - # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) - assert wal_size_after_checkpoint < 16 * 2.5 - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0bb356aa0c..03358bb0b5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d +Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 24333abb81..a2dc225ddf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7 +Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 863b71572b..225071f482 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb +Subproject commit 225071f482774943854c2eec4540757e01171557 diff --git a/vendor/revisions.json b/vendor/revisions.json index a9575a2cb7..def4eab069 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb", - "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7", - "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d" + "postgres-v16": "225071f482774943854c2eec4540757e01171557", + "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", + "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 804405293f..68be0b3617 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -36,6 +36,7 @@ files: max_client_conn=10000 default_pool_size=64 
max_prepared_statements=0 + admin_users=cloud_admin - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4621a75c0b..4f13064088 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,6 +39,7 @@ futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +getrandom = { version = "0.2", default-features = false, features = ["std"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -50,13 +51,14 @@ nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16", features = ["std"] } +ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -75,8 +77,8 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4"] } -zstd = { version = "0.12" } -zstd-safe = { version = "6", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } +zstd = { version = "0.13" } +zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] @@ -84,11 +86,13 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } +getrandom = { version = "0.2", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +once_cell = { version = "1" } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
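The vendor submodule bumps above are mirrored in vendor/revisions.json. A hypothetical consistency check (not taken from this patch) that a checkout with initialized submodules could run to keep the two in sync might look like:

import json
import subprocess


def check_vendor_revisions(repo_root: str = ".") -> None:
    # vendor/revisions.json maps e.g. "postgres-v16" to the expected submodule commit
    with open(f"{repo_root}/vendor/revisions.json") as f:
        expected = json.load(f)
    for name, want in expected.items():
        got = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=f"{repo_root}/vendor/{name}", text=True
        ).strip()
        assert got == want, f"vendor/{name} is at {got}, but revisions.json expects {want}"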