diff --git a/.config/nextest.toml b/.config/nextest.toml index a9398e4ab0..affdc16f31 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = { period = "20s", terminate-after = 3 } +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.dockerignore b/.dockerignore index f7a6232ba1..1258532db8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/.github/actionlint.yml b/.github/actionlint.yml index cb36e2eee6..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,12 +1,11 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large - # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. - - macos-14 + - large-arm64 - small + - small-arm64 - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index dea3fc2357..9f752d5a89 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -3,13 +3,13 @@ description: 'Create Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to create Branch in' + description: 'ID of the Project to create Branch in' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build outputs: dsn: diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 8acba7ad00..58141a4a3f 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -3,16 +3,16 @@ description: 'Delete Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true 
project_id: - desctiption: 'ID of the Project which should be deleted' + description: 'ID of the Project which should be deleted' required: true branch_id: - desctiption: 'ID of the branch to delete' + description: 'ID of the branch to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 7f0e599b97..16759ad038 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -3,22 +3,22 @@ description: 'Create Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true region_id: - desctiption: 'Region ID, if not set the project will be created in the default region' + description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - desctiption: 'Postgres version; default is 15' - default: 15 + description: 'Postgres version; default is 15' + default: '15' api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build provisioner: - desctiption: 'k8s-pod or k8s-neonvm' + description: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' compute_units: - desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' default: '[1, 1]' outputs: diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index b8ec6cac70..35e165fd61 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -3,13 +3,13 @@ description: 'Delete Neon Project using API' 
inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to delete' + description: 'ID of the Project to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index c527cef1ac..bdf00bcaae 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -39,7 +39,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index eada65505f..f8c011a0a5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -341,6 +341,9 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done @@ -543,9 +546,27 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + report-benchmarks-failures: + needs: [ benchmarks, create-test-report ] + if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + runs-on: ubuntu-latest + + steps: + - uses: slackapi/slack-github-action@v1 + with: + channel-id: C060CNA47S9 # on-call-staging-storage-stream + slack-message: | + Benchmarks failed on main: ${{ github.event.head_commit.url }} + + Allure report: ${{ 
needs.create-test-report.outputs.report-url }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + create-test-report: needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, gen3, small ] container: @@ -702,9 +723,13 @@ jobs: uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit - neon-image: + neon-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -726,12 +751,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - uses: docker/build-push-action@v5 with: context: . 
@@ -743,25 +762,52 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache - cache-to: type=registry,ref=neondatabase/neon:cache,mode=max + cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - neondatabase/neon:${{needs.tag.outputs.build-tag}} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom - compute-node-image: - needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + neon-image: + needs: [ neon-image-arch, tag ] + runs-on: ubuntu-latest + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }} + + compute-node-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - 
name: Checkout @@ -808,15 +854,14 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache - cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: ${{ matrix.version == 'v16' }} + if: matrix.version == 'v16' uses: docker/build-push-action@v5 with: target: compute-tools-image @@ -830,14 +875,57 @@ jobs: pull: true file: Dockerfile.compute-node tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom + compute-node-image: + needs: [ compute-node-image-arch, tag ] + runs-on: ubuntu-latest + + strategy: + matrix: + version: [ v14, v15, v16 ] + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ 
needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + + - name: Create multi-arch compute-tools image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Push multi-arch compute-tools image to ECR + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, gen3, large ] @@ -845,11 +933,8 @@ jobs: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.28.1 + VM_BUILDER_VERSION: v0.29.3 steps: - name: Checkout @@ -862,26 +947,48 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom 
>> $GITHUB_ENV + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -spec=vm-image-spec.yaml \ - -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -899,7 +1006,7 @@ jobs: - name: Verify image versions shell: bash # ensure no set -e for better error messages run: | - pageserver_version=$(docker run --rm 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -925,78 +1032,48 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - # Don't add if-condition here. - # The job should always be run because we have dependant other jobs that shouldn't be skipped + runs-on: ubuntu-latest + + env: + VERSIONS: v14 v15 v16 steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Copy vm-compute-node images to Docker Hub + - name: Copy vm-compute-node images to ECR run: | - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 + for version in ${VERSIONS}; do + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done - name: Add latest tag to images - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'main' run: | - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest + for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do + docker buildx imagetools create -t $repo/neon:latest \ + $repo/neon:${{ needs.tag.outputs.build-tag }} - - name: Push images to production ECR - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest - crane copy 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest + docker buildx imagetools create -t $repo/compute-tools:latest \ + $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + for version in ${VERSIONS}; do + docker buildx imagetools create -t $repo/compute-node-${version}:latest \ + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - name: Push vm-compute-node to Docker Hub - run: | - crane push vm-compute-node-v14 
neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - - - name: Push latest tags to Docker Hub - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + done trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5a2f9d6645..7d2187e59c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, small-arm64 ] env: # Use release build only, to have less debug info around @@ -232,20 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo 
build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | - cargo nextest run $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -255,12 +255,12 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, small-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -269,6 +269,11 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -305,31 +310,35 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation 
- run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: @@ -338,7 +347,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -369,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/Cargo.lock b/Cargo.lock index 9bff5e1eff..d8f9021eb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.9" +version = 
"0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.1.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -309,14 +309,15 @@ dependencies = [ "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -326,9 +327,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -373,10 +374,11 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.14.0" +version = "1.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" dependencies = [ + "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -391,20 +393,25 @@ 
dependencies = [ "aws-smithy-xml", "aws-types", "bytes", + "fastrand 2.0.0", + "hex", + "hmac", "http 0.2.9", "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,9 +431,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", "aws-runtime", @@ -446,9 +453,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -469,9 +476,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -498,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -509,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -541,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -581,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.8" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -594,6 +601,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", + "http-body 1.0.0", "hyper 0.14.26", "hyper-rustls 0.24.0", "once_cell", @@ -606,9 +614,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.2.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -623,16 +631,19 @@ dependencies = [ 
[[package]] name = "aws-smithy-types" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.9", + "http 1.1.0", "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -646,18 +657,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -697,7 +708,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite 0.20.0", + "tokio-tungstenite", "tower", "tower-layer", "tower-service", @@ -968,6 +979,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" + [[package]] name = "byteorder" version = "1.4.3" @@ -1055,9 +1072,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1065,7 +1082,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.0", + "windows-targets 0.52.4", ] [[package]] @@ -1092,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -1222,8 +1239,10 @@ dependencies = [ "serde_json", "signal-hook", "tar", + "thiserror", "tokio", "tokio-postgres", + "tokio-stream", "tokio-util", "toml_edit", "tracing", @@ -1452,26 +1471,21 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", - "scopeguard", ] [[package]] @@ -1585,7 +1599,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core 0.9.8", @@ -1986,6 +2000,27 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "framed-websockets" +version = "0.1.0" 
+source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" +dependencies = [ + "base64 0.21.1", + "bytemuck", + "bytes", + "futures-core", + "futures-sink", + "http-body-util", + "hyper 1.2.0", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "thiserror", + "tokio", + "tokio-util", +] + [[package]] name = "fs2" version = "0.4.3" @@ -2238,6 +2273,17 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + [[package]] name = "hash32" version = "0.3.1" @@ -2264,9 +2310,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -2274,11 +2320,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.4" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2587,21 +2633,6 @@ dependencies = [ "tokio-native-tls", ] -[[package]] -name = "hyper-tungstenite" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" -dependencies = [ - "http-body-util", - "hyper 1.2.0", - "hyper-util", 
- "pin-project-lite", - "tokio", - "tokio-tungstenite 0.21.0", - "tungstenite 0.21.0", -] - [[package]] name = "hyper-util" version = "0.1.3" @@ -2679,7 +2710,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2935,6 +2966,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2985,7 +3025,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", "crossbeam-utils", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "itoa", "lasso", "measured-derive", @@ -3547,7 +3587,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3868,13 +3908,14 @@ dependencies = [ [[package]] name = "parquet" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "ahash", "bytes", "chrono", - "hashbrown 0.14.0", + "half 2.4.1", + "hashbrown 0.14.5", "num", "num-bigint", "paste", @@ -3882,12 +3923,13 @@ dependencies = [ "thrift", "twox-hash", "zstd", + "zstd-sys", ] [[package]] name = "parquet_derive" -version = "49.0.0" -source = 
"git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "parquet", "proc-macro2", @@ -3914,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pbkdf2" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", "hmac", @@ -4339,6 +4381,7 @@ dependencies = [ name = "proxy" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "async-compression", "async-trait", @@ -4355,12 +4398,14 @@ dependencies = [ "chrono", "clap", "consumption_metrics", + "crossbeam-deque", "dashmap", "env_logger", "fallible-iterator", + "framed-websockets", "futures", "git-version", - "hashbrown 0.13.2", + "hashbrown 0.14.5", "hashlink", "hex", "hmac", @@ -4370,7 +4415,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "hyper 1.2.0", - "hyper-tungstenite", "hyper-util", "indexmap 2.0.1", "ipnet", @@ -4415,7 +4459,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "sync_wrapper", "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", @@ -4424,6 +4467,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-tungstenite", "tokio-util", "tower-service", "tracing", @@ -5932,7 +5976,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" +source = 
"git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" [[package]] name = "syn" @@ -6360,19 +6404,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.20.1", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite 0.21.0", + "tungstenite", ] [[package]] @@ -6386,7 +6418,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "pin-project-lite", "tokio", "tracing", @@ -6668,25 +6700,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" -dependencies = [ - "byteorder", - "bytes", - "data-encoding", - "http 1.1.0", - "httparse", - "log", - "rand 0.8.5", - "sha1", - "thiserror", - "url", - "utf-8", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -7457,6 +7470,7 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "aws-config", "aws-runtime", @@ -7482,7 +7496,7 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "hex", "hmac", "hyper 0.14.26", diff --git a/Cargo.toml b/Cargo.toml index 1ddadd2f3c..0887c039f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] +ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } @@ -52,14 +53,14 @@ azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" 
async-trait = "0.1" -aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.14" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" aws-sdk-iam = "1.15.0" -aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.1.4" -aws-credential-types = "1.1.4" -aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } -aws-types = "1.1.7" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] } comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" +crossbeam-deque = "0.8.5" crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" @@ -81,13 +83,14 @@ enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" +framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" -hashbrown = "0.13" -hashlink = "0.8.4" +hashbrown = "0.14" +hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" @@ -98,7 +101,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.13.0" +tokio-tungstenite = "0.20.0" indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" @@ -121,8 +124,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "49.0.0", 
default-features = false, features = ["zstd"] } -parquet_derive = "49.0.0" +parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.14" @@ -158,8 +161,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -# https://github.com/nical/rust_debug/pull/4 -svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } +# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet +svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" @@ -243,8 +246,8 @@ tonic-build = "0.9" tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } -parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } +parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } ################# Binary contents sections diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 19739cc1f8..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.4.0 +ENV MOLD_VERSION v2.31.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index bd4534ce1d..5bf3246f34 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget 
https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ + echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/README.md b/README.md index 00a90f4483..ea0a289502 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) +[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech) + + # Neon diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 759a117ee9..8f96530a9d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true +tokio-stream.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true +thiserror.workspace = true url.workspace = true 
compute_api.workspace = true diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs new file mode 100644 index 0000000000..4fefa831e0 --- /dev/null +++ b/compute_tools/src/catalog.rs @@ -0,0 +1,116 @@ +use compute_api::{ + responses::CatalogObjects, + spec::{Database, Role}, +}; +use futures::Stream; +use postgres::{Client, NoTls}; +use std::{path::Path, process::Stdio, result::Result, sync::Arc}; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::Command, + task, +}; +use tokio_stream::{self as stream, StreamExt}; +use tokio_util::codec::{BytesCodec, FramedRead}; +use tracing::warn; + +use crate::{ + compute::ComputeNode, + pg_helpers::{get_existing_dbs, get_existing_roles}, +}; + +pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { + let connstr = compute.connstr.clone(); + task::spawn_blocking(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let roles: Vec; + { + let mut xact = client.transaction()?; + roles = get_existing_roles(&mut xact)?; + } + let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); + + Ok(CatalogObjects { roles, databases }) + }) + .await? +} + +#[derive(Debug, thiserror::Error)] +pub enum SchemaDumpError { + #[error("Database does not exist.")] + DatabaseDoesNotExist, + #[error("Failed to execute pg_dump.")] + IO(#[from] std::io::Error), +} + +// It uses the pg_dump utility to dump the schema of the specified database. +// The output is streamed back to the caller and supposed to be streamed via HTTP. +// +// Before return the result with the output, it checks that pg_dump produced any output. +// If not, it tries to parse the stderr output to determine if the database does not exist +// and special error is returned. +// +// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature. 
+pub async fn get_database_schema( + compute: &Arc, + dbname: &str, +) -> Result>, SchemaDumpError> { + let pgbin = &compute.pgbin; + let basepath = Path::new(pgbin).parent().unwrap(); + let pgdump = basepath.join("pg_dump"); + let mut connstr = compute.connstr.clone(); + connstr.set_path(dbname); + let mut cmd = Command::new(pgdump) + .arg("--schema-only") + .arg(connstr.as_str()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true) + .spawn()?; + + let stdout = cmd.stdout.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") + })?; + + let stderr = cmd.stderr.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") + })?; + + let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); + let stderr_reader = BufReader::new(stderr); + + let first_chunk = match stdout_reader.next().await { + Some(Ok(bytes)) if !bytes.is_empty() => bytes, + Some(Err(e)) => { + return Err(SchemaDumpError::IO(e)); + } + _ => { + let mut lines = stderr_reader.lines(); + if let Some(line) = lines.next_line().await? 
{ + if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + return Err(SchemaDumpError::DatabaseDoesNotExist); + } + warn!("pg_dump stderr: {}", line) + } + tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + + return Err(SchemaDumpError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "failed to start pg_dump", + ))); + } + }; + let initial_stream = stream::once(Ok(first_chunk.freeze())); + // Consume stderr and log warnings + tokio::spawn(async move { + let mut lines = stderr_reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 128783b477..0286429cf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,17 +5,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::catalog::SchemaDumpError; +use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; +use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; +use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { @@ -133,6 +137,34 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /dbs_and_roles GET request",); + match get_dbs_and_roles(compute).await { + Ok(res) => 
render_json(Body::from(serde_json::to_string(&res).unwrap())), + Err(_) => { + render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + + (&Method::GET, "/database_schema") => { + let database = match must_get_query_param(&req, "database") { + Err(e) => return e.into_response(), + Ok(database) => database, + }; + info!("serving /database_schema GET request with database: {database}",); + match get_database_schema(compute, &database).await { + Ok(res) => render_plain(Body::wrap_stream(res)), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + render_json_error("database does not exist", StatusCode::NOT_FOUND) + } + Err(e) => { + error!("can't get schema dump: {}", e); + render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { }; Response::builder() .status(status) + .header(CONTENT_TYPE, "application/json") .body(Body::from(serde_json::to_string(&error).unwrap())) .unwrap() } +fn render_json(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "application/json") + .body(body) + .unwrap() +} + +fn render_plain(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "text/plain") + .body(body) + .unwrap() +} + async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { { let mut state = compute.state.lock().unwrap(); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index d2ec54299f..b0ddaeae2b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,6 +68,51 @@ paths: schema: $ref: "#/components/schemas/Info" + /dbs_and_roles: + get: + tags: + - Info + summary: Get databases 
and roles in the catalog. + description: "" + operationId: getDbsAndRoles + responses: + 200: + description: Compute schema objects + content: + application/json: + schema: + $ref: "#/components/schemas/DbsAndRoles" + + /database_schema: + get: + tags: + - Info + summary: Get schema dump + parameters: + - name: database + in: query + description: Database name to dump. + required: true + schema: + type: string + example: "postgres" + description: Get schema dump in SQL format. + operationId: getDatabaseSchema + responses: + 200: + description: Schema dump + content: + text/plain: + schema: + type: string + description: Schema dump in SQL format. + 404: + description: Non existing database. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -229,6 +274,73 @@ components: num_cpus: type: integer + DbsAndRoles: + type: object + description: Databases and Roles + required: + - roles + - databases + properties: + roles: + type: array + items: + $ref: "#/components/schemas/Role" + databases: + type: array + items: + $ref: "#/components/schemas/Database" + + Database: + type: object + description: Database + required: + - name + - owner + - restrict_conn + - invalid + properties: + name: + type: string + owner: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + restrict_conn: + type: boolean + invalid: + type: boolean + + Role: + type: object + description: Role + required: + - name + properties: + name: + type: string + encrypted_password: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + + GenericOption: + type: object + description: Schema Generic option + required: + - name + - vartype + properties: + name: + type: string + value: + type: string + vartype: + type: string + ComputeState: type: object required: diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index eac808385c..18c228ba54 100644 
--- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,6 +8,7 @@ pub mod configurator; pub mod http; #[macro_use] pub mod logger; +pub mod catalog; pub mod compute; pub mod extension_server; pub mod monitor; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index c22b6bc14e..024c5b338e 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,3 +1,5 @@ +use std::path::Path; + use anyhow::{anyhow, Context}; use tracing::warn; @@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { .arg(size_bytes.to_string()) .spawn(); - if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { - warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); - return Ok(()); - } - child_result .context("spawn() failed") .and_then(|mut child| child.wait().context("wait() failed")) .and_then(|status| match status.success() { true => Ok(()), - false => Err(anyhow!("process exited with {status}")), + false => { + // The command failed. Maybe it was because the resize-swap file doesn't exist? + // The --once flag causes it to delete itself on success so we don't disable swap + // while postgres is running; maybe this is fine. 
+ match Path::new(RESIZE_SWAP_BIN).try_exists() { + Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")), + // The path doesn't exist; we're actually ok + Ok(false) => { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + Ok(()) + }, + } + } }) // wrap any prior error with the overall context that we couldn't run the command .with_context(|| { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 179a756135..18e395e2b5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; @@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT 
+ i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} - /// /// Timelines tree element used as a value in the HashMap. /// @@ -152,7 +117,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -341,55 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? 
} else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pageserver_config: toml_edit::Document = - if let Some(path) = init_match.get_one::("pageserver-config") { - std::fs::read_to_string(path)?.parse()? 
- } else { - toml_edit::Document::new() - }; - - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(pageserver_config.clone()) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. 
@@ -1418,9 +1393,7 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); let update_catalog = Arg::new("update-catalog") .value_parser(value_parser!(bool)) @@ -1454,14 +1427,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg( - Arg::new("pageserver-config") - .long("pageserver-config") - .required(false) - .value_parser(value_parser!(PathBuf)) - .value_name("pageserver-config") - .help("Merge the provided pageserver config into the one generated by neon_local."), - ) .arg(pg_version_arg.clone()) .arg(force_arg) ) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7abbbce95a..0edcf1be4e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use clap::ValueEnum; use postgres_backend::AuthType; @@ -23,6 +23,8 @@ use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). @@ -42,59 +44,99 @@ pub struct LocalEnv { // This is not stored in the config file. 
Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or // '.neon' if not given. - #[serde(skip)] pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. - #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, // Configuration for the storage controller (1 per neon_local environment) - #[serde(default)] pub storage_controller: NeonStorageControllerConf, /// This Vec must always contain at least one pageserver + /// Populated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - #[serde(default)] pub control_plane_api: Option, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. - #[serde(default)] pub control_plane_compute_hook_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
- #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. +#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. 
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -110,6 +152,9 @@ pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] pub max_unavailable: Duration, + + /// Threshold for auto-splitting a tenant into shards + pub split_threshold: Option, } impl NeonStorageControllerConf { @@ -122,6 +167,7 @@ impl Default for NeonStorageControllerConf { fn default() -> Self { Self { max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + split_threshold: None, } } } @@ -141,24 +187,18 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. +// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - pub(crate) virtual_file_io_engine: Option, - pub(crate) get_vectored_impl: Option, - pub(crate) get_impl: Option, - pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -169,10 +209,40 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - virtual_file_io_engine: None, - get_vectored_impl: None, - get_impl: None, - validate_vectored_get: None, + } + } +} + +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. 
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, } } } @@ -360,44 +430,7 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. - if env.neon_distrib_dir == Path::new("") { - env::current_exe()? - .parent() - .unwrap() - .clone_into(&mut env.neon_distrib_dir); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config + /// Construct `Self` from on-disk state. 
pub fn load_config() -> anyhow::Result { let repopath = base_path(); @@ -411,38 +444,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. + assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(&repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] 
+ .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. 
- // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -467,17 +591,13 @@ impl LocalEnv { } } - // - // Initialize a new Neon repository - // - pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. 
+ pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + // create base_path dir if base_path.exists() { match force { InitForceMode::MustNotExist => { @@ -509,74 +629,96 @@ impl LocalEnv { } } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. + let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. // // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. 
- if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. + generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance to operate on + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in &env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { 
+ let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - for ps in &self.pageservers { - fs::create_dir(self.pageserver_data_dir(ps.id))?; - } + // set up the remote location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - self.persist_config(base_path) - } - - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { +pub fn base_path() -> PathBuf { match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), None => PathBuf::from(".neon"), @@ -619,31 +761,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( - simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" 
- ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6046c93bad..5a84763697 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -30,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -76,9 +76,11 @@ impl PageServerNode { fn pageserver_init_make_toml( &self, - cli_overrides: toml_edit::Document, + conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - // TODO: this is a legacy code, it should be refactored to use toml_edit directly. + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( @@ -86,61 +88,9 @@ impl PageServerNode { self.env.pg_distrib_dir_raw().display() ); - let PageServerConf { - id, - listen_pg_addr, - listen_http_addr, - pg_auth_type, - http_auth_type, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - } = &self.conf; - - let id = format!("id={}", id); - - let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { - format!("virtual_file_io_engine='{virtual_file_io_engine}'") - } else { - String::new() - }; - let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { - format!("get_vectored_impl='{get_vectored_impl}'") - } else { - String::new() - }; - let get_impl = if let Some(get_impl) = get_impl { - format!("get_impl='{get_impl}'") - } else { - String::new() - }; - let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { - format!("validate_vectored_get={validate_vectored_get}") - } else { - String::new() - }; - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( @@ -150,7 +100,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to 
talk to them. - if matches!(http_auth_type, AuthType::NeonJWT) { + if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -159,20 +109,23 @@ impl PageServerNode { } } - if !cli_overrides.contains_key("remote_storage") { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.push(cli_overrides.to_string()); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. @@ -188,8 +141,8 @@ impl PageServerNode { } /// Initializes a pageserver node by creating its config with the overrides provided. 
- pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> { - self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -209,7 +162,7 @@ impl PageServerNode { self.start_node().await } - fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -221,7 +174,7 @@ impl PageServerNode { io::stdout().flush()?; let config = self - .pageserver_init_make_toml(cli_overrides) + .pageserver_init_make_toml(conf) .context("make pageserver toml")?; let config_file_path = datadir.join("pageserver.toml"); let mut config_file = std::fs::OpenOptions::new() diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f1c43f4036..b6b7ea7762 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -243,9 +243,13 @@ impl StorageController { anyhow::bail!("initdb failed with status {status}"); } + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. 
tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}", self.postgres_port), + format!("port = {}\nfsync=off\n", self.postgres_port), ) .await?; }; @@ -305,6 +309,10 @@ impl StorageController { )); } + if let Some(split_threshold) = self.config.split_threshold.as_ref() { + args.push(format!("--split-threshold={split_threshold}")) + } + background_process::start_process( COMMAND, &self.env.base_data_dir, diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index f1b1986072..974dcd7f03 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e18b0f9176..062fc6fc92 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -8,8 +8,6 @@ # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). -# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. 
- set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index fd0c90d447..d05d625b0a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::ComputeSpec; +use crate::spec::{ComputeSpec, Database, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -113,6 +113,12 @@ pub struct ComputeMetrics { pub total_ext_download_size: u64, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct CatalogObjects { + pub roles: Vec, + pub databases: Vec, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. /// This is not actually a compute API response, so consider moving /// to a different place. diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c0c4710a00..12c6dc3a6d 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,7 +307,7 @@ impl KeySpace { } /// Merge another keyspace into the current one. - /// Note: the keyspaces must not ovelap (enforced via assertions) + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. pub fn merge(&mut self, other: &KeySpace) { let all_ranges = self .ranges diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1df5820fb9..9311dab33c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,7 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, - str::FromStr, + sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -161,6 +161,22 @@ impl std::fmt::Debug for TenantState { } } +/// A temporary lease to a specific lsn inside a timeline. 
+/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`. +#[serde_as] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LsnLease { + #[serde_as(as = "SystemTimeAsRfc3339Millis")] + pub valid_until: SystemTime, +} + +serde_with::serde_conv!( + SystemTimeAsRfc3339Millis, + SystemTime, + |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(), + |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } +); + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { @@ -289,7 +305,7 @@ pub struct TenantConfig { pub compaction_period: Option, pub compaction_threshold: Option, // defer parsing compaction_algorithm, like eviction_policy - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -308,28 +324,100 @@ pub struct TenantConfig { pub switch_aux_file_policy: Option, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` +/// tenant config. When the first aux file is written, the policy will be persisted in the 
+/// +/// Currently, we only allow the following migration path: +/// +/// Unset -> V1 +/// -> V2 +/// -> CrossValidation -> V2 +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum AuxFilePolicy { + /// V1 aux file policy: store everything in AUX_FILE_KEY + #[strum(ascii_case_insensitive)] V1, + /// V2 aux file policy: store in the AUX_FILE keyspace + #[strum(ascii_case_insensitive)] V2, + /// Cross validation runs both formats on the write path and does validation + /// on the read path. + #[strum(ascii_case_insensitive)] CrossValidation, } -impl FromStr for AuxFilePolicy { - type Err = anyhow::Error; +impl AuxFilePolicy { + pub fn is_valid_migration_path(from: Option, to: Self) -> bool { + matches!( + (from, to), + (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) + ) + } - fn from_str(s: &str) -> Result { - let s = s.to_lowercase(); - if s == "v1" { - Ok(Self::V1) - } else if s == "v2" { - Ok(Self::V2) - } else if s == "crossvalidation" || s == "cross_validation" { - Ok(Self::CrossValidation) - } else { - anyhow::bail!("cannot parse {} to aux file policy", s) + /// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used. + pub fn default_tenant_config() -> Self { + Self::V1 + } +} + +/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. 
+pub struct AtomicAuxFilePolicy(AtomicUsize); + +impl AtomicAuxFilePolicy { + pub fn new(policy: Option) -> Self { + Self(AtomicUsize::new( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + )) + } + + pub fn load(&self) -> Option { + match self.0.load(std::sync::atomic::Ordering::Acquire) { + 0 => None, + other => Some(AuxFilePolicy::from_usize(other)), } } + + pub fn store(&self, policy: Option) { + self.0.store( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + std::sync::atomic::Ordering::Release, + ); + } +} + +impl AuxFilePolicy { + pub fn to_usize(self) -> usize { + match self { + Self::V1 => 1, + Self::CrossValidation => 2, + Self::V2 => 3, + } + } + + pub fn try_from_usize(this: usize) -> Option { + match this { + 1 => Some(Self::V1), + 2 => Some(Self::CrossValidation), + 3 => Some(Self::V2), + _ => None, + } + } + + pub fn from_usize(this: usize) -> Self { + Self::try_from_usize(this).unwrap() + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -350,13 +438,28 @@ impl EvictionPolicy { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum CompactionAlgorithm { Legacy, Tiered, } +#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] +pub struct CompactionAlgorithmSettings { + pub kind: CompactionAlgorithm, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -604,6 +707,9 @@ pub struct TimelineInfo { pub state: TimelineState, pub walreceiver_status: String, + + /// The last aux file policy being used on this timeline + pub last_aux_file_policy: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -710,6 +816,8 @@ pub enum 
HistoricLayerInfo { lsn_end: Lsn, remote: bool, access_stats: LayerAccessStats, + + l0: bool, }, Image { layer_file_name: String, @@ -745,6 +853,16 @@ impl HistoricLayerInfo { }; *field = value; } + pub fn layer_file_size(&self) -> u64 { + match self { + HistoricLayerInfo::Delta { + layer_file_size, .. + } => *layer_file_size, + HistoricLayerInfo::Image { + layer_file_size, .. + } => *layer_file_size, + } + } } #[derive(Debug, Serialize, Deserialize)] @@ -752,6 +870,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IngestAuxFilesRequest { + pub aux_files: HashMap, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListAuxFilesRequest { + pub lsn: Lsn, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DownloadRemoteLayersTaskInfo { pub task_id: String, @@ -776,9 +904,6 @@ pub struct TimelineGcRequest { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerProcessStatus { pub pid: u32, - /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. - /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. - pub kind: Cow<'static, str>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -817,6 +942,55 @@ pub struct TenantScanRemoteStorageResponse { pub shards: Vec, } +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantSorting { + ResidentSize, + MaxLogicalSize, +} + +impl Default for TenantSorting { + fn default() -> Self { + Self::ResidentSize + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TopTenantShardsRequest { + // How would you like to sort the tenants? + pub order_by: TenantSorting, + + // How many results? + pub limit: usize, + + // Omit tenants with more than this many shards (e.g. 
if this is the max number of shards + // that the caller would ever split to) + pub where_shards_lt: Option, + + // Omit tenants where the ordering metric is less than this (this is an optimization to + // let us quickly exclude numerous tiny shards) + pub where_gt: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct TopTenantShardItem { + pub id: TenantShardId, + + /// Total size of layers on local disk for all timelines in this tenant + pub resident_size: u64, + + /// Total size of layers in remote storage for all timelines in this tenant + pub physical_size: u64, + + /// The largest logical size of a timeline within this tenant + pub max_logical_size: u64, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TopTenantShardsResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, @@ -1242,6 +1416,7 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { use serde_json::json; + use std::str::FromStr; use super::*; @@ -1449,4 +1624,69 @@ mod tests { assert_eq!(actual, expected, "example on {line}"); } } + + #[test] + fn test_aux_file_migration_path() { + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V1 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V2 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::CrossValidation + )); + // Self-migration is not a valid migration path, and the caller should handle it by itself. 
+ assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::CrossValidation + )); + // Migrations not allowed + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::CrossValidation + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::CrossValidation + )); + // Migrations allowed + assert!(AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V2 + )); + } + + #[test] + fn test_aux_parse() { + assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); + assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); + assert_eq!( + AuxFilePolicy::from_str("cross-validation").unwrap(), + AuxFilePolicy::CrossValidation + ); + } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index ff6d3d91b6..1c05a01926 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -125,7 +125,7 @@ impl ShardCount { /// `v` may be zero, or the number of shards in the tenant. `v` is what /// [`Self::literal`] would return. - pub fn new(val: u8) -> Self { + pub const fn new(val: u8) -> Self { Self(val) } } @@ -559,6 +559,14 @@ impl ShardIdentity { } } + /// Obtains the shard number and count combined into a `ShardIndex`. 
+ pub fn shard_index(&self) -> ShardIndex { + ShardIndex { + shard_count: self.count, + shard_number: self.number, + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 260018ad89..6c41b7f347 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -820,10 +820,11 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } - /// Log as info/error result of handling COPY stream and send back - /// ErrorResponse if that makes sense. Shutdown the stream if we got - /// Terminate. TODO: transition into waiting for Sync msg if we initiate the - /// close. + /// - Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. + /// - Shutdown the stream if we got Terminate. + /// - Then close the connection because we don't handle exiting from COPY + /// stream normally. pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; @@ -849,10 +850,6 @@ impl PostgresBackend { } } - if let Terminate = &end { - self.state = ProtoState::Closed; - } - let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), @@ -882,6 +879,12 @@ impl PostgresBackend { error!("failed to send ErrorResponse: {}", ee); } } + + // Proper COPY stream finishing to continue using the connection is not + // implemented at the server side (we don't need it so far). To prevent + // further usages of the connection, close it. 
+ self.framed.shutdown().await.ok(); + self.state = ProtoState::Closed; } } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 262068cbda..6052f04d11 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { // Is there enough space on the page for another logical message and an // XLOG_SWITCH? If not, start over. let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; - if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { continue; } @@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", &[&(repeats as i32)], )?; - break; - } - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); - // Emit the XLOG_SWITCH - let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - xlog_switch_record_end < next_segment, - "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}", - xlog_switch_record_end, - next_segment - ); - ensure!( - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - xlog_switch_record_end, - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ - ); - Ok(vec![before_xlog_switch, xlog_switch_record_end]) + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let 
xlog_switch_record_end: PgLsn = + client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + + if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + != XLOG_SIZE_OF_XLOG_SHORT_PHD + { + warn!( + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + ); + continue; + } + return Ok(vec![before_xlog_switch, xlog_switch_record_end]); + } } } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304..220d4ef115 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -29,6 +29,7 @@ use http_types::{StatusCode, Url}; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::RemoteStorageActivity; use crate::{ error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage { // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f..f024021507 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static { done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; + + /// Query how busy we currently are: may be used by callers which wish to politely + /// back off if there are already a lot of operations underway. 
+ fn activity(&self) -> RemoteStorageActivity; +} + +pub struct RemoteStorageActivity { + pub read_available: usize, + pub read_total: usize, + pub write_available: usize, + pub write_total: usize, } /// DownloadStream is sensitive to the timeout and cancellation used with the original @@ -444,6 +455,15 @@ impl GenericRemoteStorage> { } } } + + pub fn activity(&self) -> RemoteStorageActivity { + match self { + Self::LocalFs(s) => s.activity(), + Self::AwsS3(s) => s.activity(), + Self::AzureBlob(s) => s.activity(), + Self::Unreliable(s) => s.activity(), + } + } } impl GenericRemoteStorage { @@ -774,6 +794,9 @@ struct ConcurrencyLimiter { // The helps to ensure we don't exceed the thresholds. write: Arc, read: Arc, + + write_total: usize, + read_total: usize, } impl ConcurrencyLimiter { @@ -802,10 +825,21 @@ impl ConcurrencyLimiter { Arc::clone(self.for_kind(kind)).acquire_owned().await } + fn activity(&self) -> RemoteStorageActivity { + RemoteStorageActivity { + read_available: self.read.available_permits(), + read_total: self.read_total, + write_available: self.write.available_permits(), + write_total: self.write_total, + } + } + fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), + read_total: limit, + write_total: limit, } } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..f12f6590a3 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, + TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, 
StorageMetadata}; @@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs { ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available + RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16, + } + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c0b89cee2a..0f6772b274 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -27,7 +27,7 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, @@ -47,8 +47,8 @@ use utils::backoff; use super::StorageMetadata; use crate::{ error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, + TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -75,13 +75,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -113,6 +113,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 
429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -120,42 +152,36 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] 
- } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), - upload_storage_class: aws_config.upload_storage_class.clone(), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } @@ -949,6 +975,10 @@ impl RemoteStorage for S3Bucket { } Ok(()) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. 
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..66522e04ca 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, TimeTravelError, + RemoteStorageActivity, StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper { .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await } + + fn activity(&self) -> RemoteStorageActivity { + self.inner.activity() + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ce5a1e411e..2fbc333075 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -50,6 +50,9 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + // Minimum of all active RO replicas flush LSN + #[serde(default = "lsn_invalid")] + pub standby_horizon: Lsn, } #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 0bf5664f47..27378c69fc 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -3,7 +3,7 @@ //! # Example //! //! ``` -//! # tokio_test::block_on(async { +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { //! use utils::poison::Poison; //! use std::time::Duration; //! diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index fb815607a7..f7b72b205f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -496,9 +496,9 @@ mod tests { // TODO: When updating Postgres versions, this test will cause // problems. 
Postgres version in message needs updating. // - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1d02aa7709 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); diff --git 
a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5b871c5d5e..5aab10e5d9 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -30,47 +30,27 @@ //! 2024-04-15 on i3en.3xlarge //! //! ```text -//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] -//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] -//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] -//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] -//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] -//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] -//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] -//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] -//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] -//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] -//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] -//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] -//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] -//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] -//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] -//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] -//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] -//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] -//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] -//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] -//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] -//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] -//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] -//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] -//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] -//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] -//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] -//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] -//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] -//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] -//! 
sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] -//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] +//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{ - config::PageServerConf, - walrecord::NeonWalRecord, - walredo::{PostgresRedoManager, ProcessKind}, -}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-short")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients 
in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } - - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-medium")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } } } @@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
-fn bench_impl( - process_kind: ProcessKind, - redo_work: Arc, - n_redos: u64, - nclients: u64, -) -> Duration { +fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - conf.walredo_process_kind = process_kind; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -158,27 +125,13 @@ fn bench_impl( }); } - let elapsed = rt.block_on(async move { + rt.block_on(async move { let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }); - - // consistency check to ensure process kind setting worked - if nredos_per_client > 0 { - assert_eq!( - manager - .status() - .process - .map(|p| p.kind) - .expect("the benchmark work causes a walredo process to be spawned"), - std::borrow::Cow::Borrowed(process_kind.into()) - ); - } - - elapsed + }) } async fn client( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 6df8b2170d..69b86d9c46 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,8 +1,12 @@ +use std::collections::HashMap; + +use bytes::Bytes; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, + lsn::Lsn, }; pub mod util; @@ -486,6 +490,18 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); + self.request(Method::POST, uri, request) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn layer_map_info( &self, tenant_shard_id: TenantShardId, @@ -549,4 +565,57 @@ impl Client { }), } } + + pub async fn ingest_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + aux_files: HashMap, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) + .await?; + match resp.status() { + StatusCode::OK => Ok(true), + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn list_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/list_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) + .await?; + match resp.status() { + StatusCode::OK => { + let resp: HashMap = resp.json().await.map_err(|e| { + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) + })?; + Ok(resp) + } + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index 1fd69407d3..c308694ae1 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,4 +1,5 @@ use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; use 
pageserver_compaction::simulator::MockTimeline; use rand::Rng; use std::io::Write; @@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> let mut executor = MockTimeline::new(); // Convert the logical size in MB into a key range. - let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192); + let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); //let key_range = u64::MIN..u64::MAX; println!( "starting simulation with key range {:016X}-{:016X}", diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 137b93055a..20f88868f9 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -24,7 +24,9 @@ use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, +}; use crate::interface::*; use utils::lsn::Lsn; @@ -104,7 +106,13 @@ pub async fn compact_tiered( ctx, ) .await?; - if target_file_size == u64::MAX { + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); break; } current_level_no += 1; @@ -371,7 +379,7 @@ where .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, &self.shard_identity, - ) * 8192; + ) * PAGE_SZ; let wal_size = job .input_layers @@ -433,7 +441,7 @@ where let mut window = KeyspaceWindow::new( E::Key::MIN..E::Key::MAX, keyspace, - self.target_file_size / 8192, + self.target_file_size / PAGE_SZ, ); while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { @@ -522,8 +530,6 @@ where // If we have accumulated only a narrow band of keyspace, 
create an // image layer. Otherwise write a delta layer. - // FIXME: deal with the case of lots of values for same key - // FIXME: we are ignoring images here. Did we already divide the work // so that we won't encounter them here? @@ -535,43 +541,101 @@ where } } // Open stream - let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? + .map(Result::<_, anyhow::Error>::Ok)); let mut new_jobs = Vec::new(); // Slide a window through the keyspace - let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream)); + let mut key_accum = + std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); let mut all_in_window: bool = false; let mut window = Window::new(); + + // Helper function to create a job for a new delta layer with given key-lsn + // rectangle. + let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { + // The inputs for the job are all the input layers of the original job that + // overlap with the rectangle. + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + }; + loop { - if all_in_window && window.elems.is_empty() { + if all_in_window && window.is_empty() { // All done! 
break; } + + // If we now have enough keyspace for next delta layer in the window, create a + // new delta layer if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) { - let batch_layers: Vec = job - .input_layers - .iter() - .filter(|layer_id| { - overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) - }) - .cloned() - .collect(); - assert!(!batch_layers.is_empty()); - new_jobs.push(CompactionJob { - key_range, - lsn_range: job.lsn_range.clone(), - strategy: CompactionStrategy::CreateDelta, - input_layers: batch_layers, - completed: false, - }); - } else { - assert!(!all_in_window); - if let Some(next_key) = key_accum.next().await.transpose()? { - window.feed(next_key.key, next_key.size); - } else { + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + continue; + } + assert!(!all_in_window); + + // Process next key in the key space + match key_accum.next().await.transpose()? { + None => { all_in_window = true; } + Some(next_key) if next_key.partition_lsns.is_empty() => { + // Normal case: extend the window by the key + window.feed(next_key.key, next_key.size); + } + Some(next_key) => { + // A key with too large size impact for a single delta layer. This + // case occurs if you make a huge number of updates for a single key. + // + // Drain the window with has_more = false to make a clean cut before + // the key, and then make dedicated delta layers for the single key. + // + // We cannot cluster the key with the others, because we don't want + // layer files to overlap with each other in the lsn,key space (no + // overlaps for the rectangles). 
+ let key = next_key.key; + debug!("key {key} with size impact larger than the layer size"); + while !window.is_empty() { + let has_more = false; + let key_range = window.choose_next_delta(self.target_file_size, has_more) + .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + } + + // Not really required: but here for future resilience: + // We make a "gap" here, so any structure the window holds should + // probably be reset. + window = Window::new(); + + let mut prior_lsn = job.lsn_range.start; + let mut lsn_ranges = Vec::new(); + for (lsn, _size) in next_key.partition_lsns.iter() { + lsn_ranges.push(prior_lsn..*lsn); + prior_lsn = *lsn; + } + lsn_ranges.push(prior_lsn..job.lsn_range.end); + for lsn_range in lsn_ranges { + let key_range = key..key.next(); + create_delta_job(key_range, &lsn_range, &mut new_jobs); + } + } } } @@ -599,8 +663,8 @@ where } } -// Sliding window through keyspace and values -// This is used by over_with_images to decide on good split points +/// Sliding window through keyspace and values for image layer +/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points struct KeyspaceWindow { head: KeyspaceWindowHead, @@ -740,9 +804,9 @@ struct WindowElement { accum_size: u64, } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. +/// Sliding window through keyspace and values for delta layer tiling +/// +/// This is used to decide which delta layer to write next. struct Window { elems: VecDeque>, @@ -766,11 +830,13 @@ where fn feed(&mut self, key: K, size: u64) { let last_size; if let Some(last) = self.elems.back_mut() { - assert!(last.last_key <= key); - if key == last.last_key { - last.accum_size += size; - return; - } + // We require the keys to be strictly increasing for the window. 
+ // Keys should already have been deduplicated by `accum_key_values` + assert!( + last.last_key < key, + "last_key(={}) >= key(={key})", + last.last_key + ); last_size = last.accum_size; } else { last_size = 0; @@ -792,6 +858,10 @@ where self.elems.front().unwrap().accum_size - self.splitoff_size } + fn is_empty(&self) -> bool { + self.elems.is_empty() + } + fn commit_upto(&mut self, mut upto: usize) { while upto > 1 { let popped = self.elems.pop_front().unwrap(); @@ -854,7 +924,7 @@ where // If we're willing to stretch it up to 1.25 target size, could we // gobble up the rest of the work? This avoids creating very small // "tail" layers at the end of the keyspace - if !has_more && self.remain_size() < target_size * 5 / 3 { + if !has_more && self.remain_size() < target_size * 5 / 4 { self.commit_upto(self.elems.len()); } else { let delta_split_at = self.find_size_split(target_size); diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 1b80373ba7..8ed1d16082 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -9,10 +9,14 @@ use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; +use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; +use utils::lsn::Lsn; + +pub const PAGE_SZ: u64 = 8192; pub fn keyspace_total_size( keyspace: &CompactionKeySpace, @@ -108,17 +112,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( } } +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. 
+ let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { - fn key(&self) -> E::Key { + fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } } impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { fn partial_cmp(&self, other: &Self) -> Option { @@ -128,12 +155,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap - other.key().cmp(&self.key()) + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { fn eq(&self, other: &Self) -> bool { - self.key().eq(&other.key()) + self.cmp(other) == std::cmp::Ordering::Equal } } impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} @@ -210,11 +237,16 @@ pub struct KeySize { pub key: K, pub num_values: u64, pub size: u64, + /// The lsns to partition at (if empty then no per-lsn partitioning) + pub partition_lsns: Vec<(Lsn, u64)>, } -pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> +pub fn accum_key_values<'a, I, K, D, E>( + input: I, + target_size: u64, +) -> impl Stream, E>> where - K: Eq, + K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { @@ -224,25 +256,39 @@ where if let 
Some(first) = input.next().await { let first = first?; + let mut part_size = first.size(); let mut accum: KeySize = KeySize { key: first.key(), num_values: 1, - size: first.size(), + size: part_size, + partition_lsns: Vec::new(), }; + let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { - accum.size += this.size(); + let add_size = this.size(); + if part_size + add_size > target_size { + accum.partition_lsns.push((this.lsn(), part_size)); + part_size = 0; + } + part_size += add_size; + accum.size += add_size; accum.num_values += 1; } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; yield accum; + part_size = this.size(); accum = KeySize { key: this.key(), num_values: 1, - size: this.size(), + size: part_size, + partition_lsns: Vec::new(), }; } } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 98dd46925c..1853afffdd 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -184,6 +184,12 @@ impl Level { } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { + let key_range = l.key_range(); + if key_range.end == key_range.start.next() && l.is_delta() { + // Ignore single-key delta layers as they can be stacked on top of each other + // as that is the only way to cut further. 
+ continue; + } events.push(Event { key: l.key_range().start, layer_idx: idx, diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 3543df64fa..a7c8bd5c1f 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -14,6 +14,7 @@ use std::ops::Range; use std::sync::Arc; use std::sync::Mutex; +use crate::helpers::PAGE_SZ; use crate::helpers::{merge_delta_keys, overlaps_with}; use crate::interface; @@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline { let new_layer = Arc::new(MockImageLayer { key_range: key_range.clone(), lsn_range: lsn..lsn, - file_size: accum_size * 8192, + file_size: accum_size * PAGE_SZ, deleted: Mutex::new(false), }); info!( diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 1cea2a20e1..bd8b54a286 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -1,23 +1,35 @@ +use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one /// key, we still too many records to fit in the target file size. We need to /// split in the LSN dimension too in that case. -/// -/// TODO: The code to avoid this problem has not been implemented yet! So the -/// assertion currently fails, but we need to make it not fail. 
-#[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { + setup_logging(); let mut executor = MockTimeline::new(); - executor.target_file_size = 10_000_000; // 10 MB + executor.target_file_size = 1_000_000; // 1 MB - // Ingest 100 MB of updates to a single key. + // Ingest 10 MB of updates to a single key. for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); - executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } @@ -27,9 +39,32 @@ async fn test_many_updates_for_single_key() { } for l in executor.live_layers.iter() { assert!(l.file_size() < executor.target_file_size * 2); - // sanity check that none of the delta layers are stupidly small either + // Sanity check that none of the delta layers are empty either. if l.is_delta() { - assert!(l.file_size() > executor.target_file_size / 2); + assert!(l.file_size() > 0); } } } + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 9a556cb3d4..389519c65a 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -28,6 +28,8 @@ //! # From an `index_part.json` in S3 //! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg //! +//! # enrich with lines for gc_cutoff and a child branch point +//! 
cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! //! ## Viewing @@ -48,9 +50,8 @@ //! ``` //! -use anyhow::Result; +use anyhow::{Context, Result}; use pageserver::repository::Key; -use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -81,6 +82,11 @@ fn parse_filename(name: &str) -> (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + + if lsns.last().expect("should").len() == 8 { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } @@ -90,6 +96,33 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin struct Layer { @@ -99,15 +132,32 @@ pub fn main() -> Result<()> { } let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) 
+ } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); - if filename == METADATA_FILE_NAME { - // Don't try and parse "metadata" like a key-lsn range - continue; - } let (key_range, lsn_range) = parse_filename(filename); files.push(Layer { filename: filename.to_owned(), @@ -117,8 +167,9 @@ pub fn main() -> Result<()> { } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + for Layer { key_range: keyr, lsn_range: lsnr, @@ -131,6 +182,8 @@ pub fn main() -> Result<()> { lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -144,10 +197,13 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + for Layer { filename, key_range: keyr, @@ -169,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -189,7 +244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - 
stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -200,6 +255,26 @@ pub fn main() -> Result<()> { .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..0d010eb009 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index c4c282f33d..b4bb239f44 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: 
&RequestContext) -> Result> { - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index be8f91675d..3611b0baab 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 1fb75584fc..e92c352dab 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -219,6 +219,7 @@ fn handle_metadata( let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; + // TODO: simplify this part if let Some(disk_consistent_lsn) = disk_consistent_lsn { meta = TimelineMetadata::new( *disk_consistent_lsn, diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs new file mode 100644 index 0000000000..eb5b242a5f --- /dev/null +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -0,0 +1,98 @@ +use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::shard::TenantShardId; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use std::collections::HashMap; +use std::sync::Arc; + +/// Ingest aux files into the pageserver. 
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: None, + targets: { + if let Some(targets) = &args.targets { + if targets.len() != 1 { + anyhow::bail!("must specify exactly one target"); + } + Some(targets.clone()) + } else { + None + } + }, + }, + ) + .await?; + + let timeline = timelines[0]; + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + let timeline_id = timeline.timeline_id; + + println!("operating on timeline {}", timeline); + + mgmt_api_client + .tenant_config(&TenantConfigRequest { + tenant_id: timeline.tenant_id, + config: TenantConfig { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + }) + .await?; + + for batch in 0..100 { + let items = (0..100) + .map(|id| { + ( + format!("pg_logical/mappings/{:03}.{:03}", batch, id), + format!("{:08}", id), + ) + }) + .collect::>(); + let file_cnt = items.len(); + mgmt_api_client + .ingest_aux_files(tenant_shard_id, timeline_id, items) + .await?; + println!("ingested {file_cnt} files"); + } + + let files = mgmt_api_client + 
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + + println!("{} files found", files.len()); + + anyhow::Ok(()) +} diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 197e782dca..1bb71b9353 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; +use std::{f64, sync::Arc}; use tokio::{ sync::{mpsc, OwnedSemaphorePermit}, task::JoinSet, @@ -12,10 +14,7 @@ use tokio::{ use std::{ num::NonZeroUsize, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::atomic::{AtomicU64, Ordering}, time::{Duration, Instant}, }; @@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> { Ok(()) } +#[derive(serde::Serialize)] +struct Output { + downloads_count: u64, + downloads_bytes: u64, + evictions_count: u64, + timeline_restarts: u64, + #[serde(with = "humantime_serde")] + runtime: Duration, +} + #[derive(Debug, Default)] struct LiveStats { - evictions: AtomicU64, - downloads: AtomicU64, + evictions_count: AtomicU64, + downloads_count: AtomicU64, + downloads_bytes: AtomicU64, timeline_restarts: AtomicU64, } impl LiveStats { fn eviction_done(&self) { - self.evictions.fetch_add(1, Ordering::Relaxed); + self.evictions_count.fetch_add(1, Ordering::Relaxed); } - fn download_done(&self) { - self.downloads.fetch_add(1, Ordering::Relaxed); + fn download_done(&self, size: u64) { + self.downloads_count.fetch_add(1, Ordering::Relaxed); + self.downloads_bytes.fetch_add(size, Ordering::Relaxed); } fn timeline_restart_done(&self) { self.timeline_restarts.fetch_add(1, Ordering::Relaxed); @@ -92,28 +103,49 @@ async fn 
main_impl(args: Args) -> anyhow::Result<()> { ) .await?; + let token = CancellationToken::new(); let mut tasks = JoinSet::new(); - let live_stats = Arc::new(LiveStats::default()); + let periodic_stats = Arc::new(LiveStats::default()); + let total_stats = Arc::new(LiveStats::default()); + + let start = Instant::now(); tasks.spawn({ - let live_stats = Arc::clone(&live_stats); + let periodic_stats = Arc::clone(&periodic_stats); + let total_stats = Arc::clone(&total_stats); + let cloned_token = token.clone(); async move { let mut last_at = Instant::now(); loop { + if cloned_token.is_cancelled() { + return; + } tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; let now = Instant::now(); let delta: Duration = now - last_at; last_at = now; let LiveStats { - evictions, - downloads, + evictions_count, + downloads_count, + downloads_bytes, timeline_restarts, - } = &*live_stats; - let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); - let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + } = &*periodic_stats; + let evictions_count = evictions_count.swap(0, Ordering::Relaxed); + let downloads_count = downloads_count.swap(0, Ordering::Relaxed); + let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); - info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + + total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); + total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); + total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); + total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); + + let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); + let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); + let downloads_mibs_per_s = downloads_bytes as f64 / 
delta.as_secs_f64() / ((1 << 20) as f64); + + info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); } } }); @@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { args, Arc::clone(&mgmt_api_client), tl, - Arc::clone(&live_stats), + Arc::clone(&periodic_stats), + token.clone(), )); } } + if let Some(runtime) = args.runtime { + tokio::spawn(async move { + tokio::time::sleep(runtime.into()).await; + token.cancel(); + }); + } while let Some(res) = tasks.join_next().await { res.unwrap(); } + let end = Instant::now(); + let duration: Duration = end - start; + + let output = { + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*total_stats; + Output { + downloads_count: downloads_count.load(Ordering::Relaxed), + downloads_bytes: downloads_bytes.load(Ordering::Relaxed), + evictions_count: evictions_count.load(Ordering::Relaxed), + timeline_restarts: timeline_restarts.load(Ordering::Relaxed), + runtime: duration, + } + }; + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + Ok(()) } @@ -140,6 +200,7 @@ async fn timeline_actor( mgmt_api_client: Arc, timeline: TenantTimelineId, live_stats: Arc, + token: CancellationToken, ) { // TODO: support sharding let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); @@ -149,7 +210,7 @@ async fn timeline_actor( layers: Vec>, concurrency: Arc, } - loop { + while !token.is_cancelled() { debug!("restarting timeline"); let layer_map_info = mgmt_api_client .layer_map_info(tenant_shard_id, timeline.timeline_id) @@ -185,7 +246,7 @@ async fn timeline_actor( live_stats.timeline_restart_done(); - loop { + while !token.is_cancelled() { assert!(!timeline.joinset.is_empty()); if let Some(res) = timeline.joinset.try_join_next() { debug!(?res, "a layer actor exited, should not happen"); @@ -255,7 +316,7 @@ async fn layer_actor( 
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) .await .unwrap(); - live_stats.download_done(); + live_stats.download_done(layer.layer_file_size()); did_it } }; diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 743102d853..5527557450 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -14,6 +14,7 @@ mod util { /// The pagebench CLI sub-commands, dispatched in [`main`] below. mod cmd { + pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; pub(super) mod ondemand_download_churn; @@ -27,6 +28,7 @@ enum Args { GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), OndemandDownloadChurn(cmd::ondemand_download_churn::Args), + AuxFiles(cmd::aux_files::Args), } fn main() { @@ -46,6 +48,7 @@ fn main() { cmd::trigger_initial_size_calculation::main(args) } Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Args::AuxFiles(args) => cmd::aux_files::main(args), } .unwrap() } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index a26ed84a0d..38e1875db1 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,15 +1,39 @@ +use std::sync::Arc; + +use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; -/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +// BEGIN Copyright (c) 2017 Servo Contributors + +/// Const version of FNV hash. 
+#[inline] +#[must_use] +pub const fn fnv_hash(bytes: &[u8]) -> u128 { + const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; + const PRIME: u128 = 0x0000000001000000000000000000013B; + + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash ^= bytes[i] as u128; + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} + +// END Copyright (c) 2017 Servo Contributors + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { - let mut key = [0; METADATA_KEY_SIZE]; - let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; + let hash = fnv_hash(data).to_be_bytes(); key[0] = AUX_KEY_PREFIX; key[1] = dir_level1; key[2] = dir_level2; - key[3..16].copy_from_slice(&hash[0..13]); + key[3..16].copy_from_slice(&hash[3..16]); Key::from_metadata_key_fixed_size(&key) } @@ -140,6 +164,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { Ok(encoded) } +/// An estimation of the size of aux files. 
+pub struct AuxFileSizeEstimator { + aux_file_size_gauge: IntGauge, + size: Arc>>, +} + +impl AuxFileSizeEstimator { + pub fn new(aux_file_size_gauge: IntGauge) -> Self { + Self { + aux_file_size_gauge, + size: Arc::new(std::sync::Mutex::new(None)), + } + } + + pub fn on_base_backup(&self, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + *guard = Some(new_size as isize); + self.report(new_size as isize); + } + + pub fn on_add(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += file_size as isize; + self.report(*size); + } + } + + pub fn on_remove(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size -= file_size as isize; + self.report(*size); + } + } + + pub fn on_update(&self, old_size: usize, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += new_size as isize - old_size as isize; + self.report(*size); + } + } + + pub fn report(&self, size: isize) { + self.aux_file_size_gauge.set(size as i64); + } +} + #[cfg(test)] mod tests { use super::*; @@ -148,15 +221,19 @@ mod tests { fn test_hash_portable() { // AUX file encoding requires the hash to be portable across all platforms. This test case checks // if the algorithm produces the same hash across different environments. 
+ assert_eq!( - 305317690835051308206966631765527126151, - twox_hash::xxh3::hash128("test1".as_bytes()) + 265160408618497461376862998434862070044, + super::fnv_hash("test1".as_bytes()) ); assert_eq!( - 85104974691013376326742244813280798847, - twox_hash::xxh3::hash128("test/test2".as_bytes()) + 295486155126299629456360817749600553988, + super::fnv_hash("test/test2".as_bytes()) + ); + assert_eq!( + 144066263297769815596495629667062367629, + super::fnv_hash("".as_bytes()) ); - assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); } #[test] @@ -164,28 +241,28 @@ mod tests { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. assert_eq!( - "6200000101E5B20C5F8DD5AA3289D6D9EAFA", - encode_aux_file_key("pg_logical/mappings/test1").to_string() + "62000001017F8B83D94F7081693471ABF91C", + encode_aux_file_key("pg_logical/mappings/test1").to_string(), ); assert_eq!( - "620000010239AAC544893139B26F501B97E6", - encode_aux_file_key("pg_logical/snapshots/test2").to_string() + "62000001027F8E83D94F7081693471ABFCCD", + encode_aux_file_key("pg_logical/snapshots/test2").to_string(), ); assert_eq!( - "620000010300000000000000000000000000", - encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + "62000001032E07BB014262B821756295C58D", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), ); assert_eq!( - "62000001FF8635AF2134B7266EC5B4189FD6", - encode_aux_file_key("pg_logical/unsupported").to_string() + "62000001FF4F38E1C74754E7D03C1A660178", + encode_aux_file_key("pg_logical/unsupported").to_string(), ); assert_eq!( - "6200000201772D0E5D71DE14DA86142A1619", + "62000002017F8D83D94F7081693471ABFB92", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( - "620000FFFF1866EBEB53B807B26A2416F317", - encode_aux_file_key("other_file_not_supported").to_string() + "620000FFFF2B6ECC8AEF93F643DC44F15E03", + encode_aux_file_key("other_file_not_supported").to_string(), ); } 
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 58b18dae7d..dca1510810 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -601,7 +601,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.timeline.is_ancestor_lsn(self.lsn) { write!(zenith_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index eb4b8bb8bb..ba5b2608bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -284,7 +284,6 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); - pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -516,16 +515,12 @@ fn start_pageserver( } }); - let secondary_controller = if let Some(remote_storage) = &remote_storage { - secondary::spawn_tasks( - tenant_manager.clone(), - remote_storage.clone(), - background_jobs_barrier.clone(), - shutdown_pageserver.clone(), - ) - } else { - secondary::null_controller() - }; + let secondary_controller = secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -533,15 +528,13 @@ fn start_pageserver( // been configured. 
let disk_usage_eviction_state: Arc = Arc::default(); - if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task( - conf, - remote_storage.clone(), - disk_usage_eviction_state.clone(), - tenant_manager.clone(), - background_jobs_barrier.clone(), - )?; - } + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + tenant_manager.clone(), + background_jobs_barrier.clone(), + )?; // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. @@ -654,17 +647,20 @@ fn start_pageserver( None, "libpq endpoint listener", true, - async move { - page_service::libpq_listener_main( - conf, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await + { + let tenant_manager = tenant_manager.clone(); + async move { + page_service::libpq_listener_main( + tenant_manager, + broker_client, + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + task_mgr::shutdown_token(), + ) + .await + } }, ); } @@ -693,14 +689,7 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. 
shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - ) - .await; + pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; unreachable!() }) } @@ -708,12 +697,11 @@ fn start_pageserver( fn create_remote_storage_client( conf: &'static PageServerConf, -) -> anyhow::Result> { +) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { config } else { - tracing::warn!("no remote storage configured, this is a deprecated configuration"); - return Ok(None); + anyhow::bail!("no remote storage configured, this is a deprecated configuration"); }; // Create the client @@ -733,7 +721,7 @@ fn create_remote_storage_client( GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); } - Ok(Some(remote_storage)) + Ok(remote_storage) } fn cli() -> Command { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 258eed0b12..b0afb6414b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,7 +99,7 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; /// /// Default built-in configuration file. 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e3c11cb299..8790a9b0a8 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -38,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -632,7 +632,7 @@ impl DeletionQueue { /// /// If remote_storage is None, then the returned workers will also be None. pub fn new( - remote_storage: Option, + remote_storage: GenericRemoteStorage, control_plane_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) @@ -658,23 +658,6 @@ impl DeletionQueue { // longer to flush after Tenants have all been torn down. 
let cancel = CancellationToken::new(); - let remote_storage = match remote_storage { - None => { - return ( - Self { - client: DeletionQueueClient { - tx, - executor_tx, - lsn_table: lsn_table.clone(), - }, - cancel, - }, - None, - ) - } - Some(r) => r, - }; - ( Self { client: DeletionQueueClient { @@ -734,20 +717,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. - pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -765,7 +748,7 @@ mod test { /// Simulate a pageserver restart by destroying and recreating the deletion queue async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( - Some(self.storage.clone()), + self.storage.clone(), Some(self.mock_control_plane.clone()), self.harness.conf, ); @@ -797,7 +780,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -875,7 +858,7 @@ mod test { let mock_control_plane = MockControlPlane::new(); let (deletion_queue, worker) 
= DeletionQueue::new( - Some(storage.clone()), + storage.clone(), Some(mock_control_plane.clone()), harness.conf, ); @@ -952,7 +935,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. 
- pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 400930245b..7f25e49570 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( } EvictionLayer::Secondary(layer) => { let file_size = layer.metadata.file_size(); - let tenant_manager = tenant_manager.clone(); js.spawn(async move { layer .secondary_tenant - .evict_layer( - tenant_manager.get_conf(), - layer.timeline_id, - layer.name, - layer.metadata, - ) + .evict_layer(layer.timeline_id, layer.name) .await; Ok(file_size) }); @@ -604,7 +598,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -637,9 +631,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index c425f3e628..e5eafc51f4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -257,6 +257,37 @@ 
paths: schema: $ref: "#/components/schemas/LsnByTimestampResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Obtain lease for the given LSN + parameters: + - name: lsn + in: query + required: true + schema: + type: string + format: hex + description: A LSN to obtain the lease for + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: "#/components/schemas/LsnLease" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: - name: tenant_id @@ -420,25 +451,6 @@ paths: description: Tenant scheduled to load successfully /v1/tenant/{tenant_id}/synthetic_size: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: | - Calculate tenant's synthetic size - responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - - # This route has no handler. TODO: remove? - /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id in: path @@ -468,19 +480,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and its timelines.
"401": description: Unauthorized Error content: @@ -610,6 +612,80 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + + put: + description: | + Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. + Current implementation might not be retryable across failure cases, but will be enhanced in future. + Detaching should be expected to be an expensive operation. Timeouts should be retried. + responses: + "200": + description: | + The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented. + If any timelines were deleted after reparenting, they might not be on this list. + content: + application/json: + schema: + $ref: "#/components/schemas/AncestorDetached" + + "400": + description: | + Number of early checks meaning the timeline cannot be detached now: + - the ancestor of timeline has an ancestor: not supported, see RFC + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "404": + description: Tenant or timeline not found. + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + + "409": + description: | + The timeline can never be detached: + - timeline has no ancestor, implying that the timeline has never had an ancestor + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + + "500": + description: | + Transient error, for example, pageserver shutdown happened while + processing the request but we were unable to distinguish that. Must + be retried. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "503": + description: | + Temporarily unavailable, please retry.
Possible reasons: + - another timeline detach for the same tenant is underway, please retry later + - detected shutdown error + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/: get: description: Get tenants list @@ -929,6 +1005,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: @@ -1006,6 +1085,15 @@ components: type: string enum: [past, present, future, nodata] + LsnLease: + type: object + required: + - valid_until + properties: + valid_until: + type: string + format: date-time + PageserverUtilization: type: object required: @@ -1063,6 +1151,19 @@ components: format: int64 description: How many bytes of layer content were in the latest layer heatmap + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + items: + type: string + format: hex + description: TimelineId + Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 83b7b8a45e..7b55e88096 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,8 @@ //! //! Management HTTP API //! 
+use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; @@ -14,6 +16,9 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::IngestAuxFilesRequest; +use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; @@ -24,7 +29,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; +use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::TopTenantShardsRequest; +use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -63,9 +72,10 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; +use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; @@ -104,7 +114,7 @@ pub struct State { tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: 
storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -118,7 +128,7 @@ impl State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -271,6 +281,13 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(gte: GetTimelineError) -> Self { + // Rationale: tenant is activated only after eligible timelines activate + ApiError::NotFound(gte.into()) + } +} + impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { @@ -433,6 +450,8 @@ async fn build_timeline_info_common( state, walreceiver_status, + + last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -633,9 +652,7 @@ async fn timeline_preserve_initdb_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() @@ -677,9 +694,7 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, @@ -813,12 +828,6 @@ async fn tenant_attach_handler( let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if state.remote_storage.is_none() { - return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is not possible because pageserver was configured without remote storage" - ))); - } - let tenant_shard_id = TenantShardId::unsharded(tenant_id); let shard_params = ShardParameters::default(); let 
location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); @@ -1229,7 +1238,7 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); @@ -1261,7 +1270,7 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let timeline = @@ -1643,12 +1652,6 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run time travel" - ))); - }; - if timestamp > done_if_after { return Err(ApiError::BadRequest(anyhow!( "The done_if_after timestamp comes before the timestamp to recover to" @@ -1658,7 +1661,7 @@ async fn tenant_time_travel_remote_storage_handler( tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); remote_timeline_client::upload::time_travel_recover_tenant( - storage, + &state.remote_storage, &tenant_shard_id, timestamp, done_if_after, @@ -1703,6 +1706,32 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } +// Obtains an lsn lease on the given timeline. 
+async fn lsn_lease_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let lsn: Lsn = parse_query_param(&request, "lsn")? + .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let result = timeline + .make_lsn_lease(lsn, &ctx) + .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + json_response(StatusCode::OK, result) +} + // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -1715,12 +1744,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; - let gc_result = wait_task_done - .await - .context("wait for gc task") - .map_err(ApiError::InternalServerError)? - .map_err(ApiError::InternalServerError)?; + let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; json_response(StatusCode::OK, gc_result) } @@ -1743,6 +1767,8 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1751,6 +1777,9 @@ async fn timeline_compact_handler( .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1775,6 +1804,8 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1788,6 +1819,10 @@ async fn timeline_checkpoint_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } + json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1871,14 +1906,11 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, true)?; let (_guard, prepared) = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) - .await - .map_err(|e| 
ApiError::InternalServerError(e.into()))?; + .await?; let res = state .tenant_manager @@ -1908,11 +1940,6 @@ async fn deletion_queue_flush( ) -> Result, ApiError> { let state = get_state(&r); - if state.remote_storage.is_none() { - // Nothing to do if remote storage is disabled. - return json_response(StatusCode::OK, ()); - } - let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { @@ -2017,9 +2044,7 @@ async fn active_timeline_of_active_tenant( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + Ok(tenant.get_timeline(timeline_id, true)?) } async fn always_panic_handler( @@ -2077,18 +2102,11 @@ async fn disk_usage_eviction_run( }; let state = get_state(&r); - - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run eviction iteration" - ))); - }; - let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( &eviction_state, - storage, + &state.remote_storage, usage, &state.tenant_manager, config.eviction_order, @@ -2125,29 +2143,23 @@ async fn tenant_scan_remote_handler( let state = get_state(&request); let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - let Some(remote_storage) = state.remote_storage.as_ref() else { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Remote storage not configured" - ))); - }; - let mut response = TenantScanRemoteStorageResponse::default(); let (shards, _other_keys) = - list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; for tenant_shard_id in shards { let (timeline_ids, _other_keys) = - list_remote_timelines(remote_storage, 
tenant_shard_id, cancel.clone()) + list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let mut generation = Generation::none(); for timeline_id in timeline_ids { match download_index_part( - remote_storage, + &state.remote_storage, &tenant_shard_id, &timeline_id, Generation::MAX, @@ -2296,6 +2308,31 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn force_aux_policy_switch_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; + let policy: AuxFilePolicy = json_request(&mut r).await?; + + let state = get_state(&r); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + timeline + .do_switch_aux_policy(policy) + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -2358,6 +2395,150 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +async fn list_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: ListAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; 
+ + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + json_response(StatusCode::OK, files) +} + +async fn ingest_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: IngestAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let mut modification = timeline.begin_modification( + Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */ + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + modification + .commit(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +/// Report on the largest tenants on this pageserver, for the storage controller to identify +/// candidates for splitting +async fn post_top_tenants( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let request: TopTenantShardsRequest = json_request(&mut r).await?; + let state = get_state(&r); + + fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 { + match order_by { + TenantSorting::ResidentSize => sizes.resident_size, + TenantSorting::MaxLogicalSize => sizes.max_logical_size, + } + } + + #[derive(Eq, PartialEq)] + struct HeapItem { + metric: u64, + sizes: TopTenantShardItem, + } + + impl PartialOrd for HeapItem { + fn partial_cmp(&self, 
other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which + /// supports popping the greatest item but not the smallest. + impl Ord for HeapItem { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Reverse(self.metric).cmp(&Reverse(other.metric)) + } + } + + let mut top_n: BinaryHeap = BinaryHeap::with_capacity(request.limit); + + // FIXME: this is a lot of clones to take this tenant list + for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() { + if let Some(shards_lt) = request.where_shards_lt { + // Ignore tenants which already have >= this many shards + if tenant_shard_id.shard_count >= shards_lt { + continue; + } + } + + let sizes = match tenant_slot { + TenantSlot::Attached(tenant) => tenant.get_sizes(), + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + }; + let metric = get_size_metric(&sizes, &request.order_by); + + if let Some(gt) = request.where_gt { + // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work + if metric <= gt { + continue; + } + }; + + match top_n.peek() { + None => { + // Top N list is empty: candidate becomes first member + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric && top_n.len() < request.limit => { + // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric => { + // List is at limit and lowest value is greater than our candidate, drop it. + } + Some(_) => top_n.push(HeapItem { metric, sizes }), + } + + while top_n.len() > request.limit { + top_n.pop(); + } + } + + json_response( + StatusCode::OK, + TopTenantShardsResponse { + shards: top_n.into_iter().map(|i| i.sizes).collect(), + }, + ) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -2570,6 +2751,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", + |r| api_handler(r, lsn_lease_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), @@ -2643,6 +2828,19 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", + |r| api_handler(r, force_aux_policy_switch_handler), + ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", + |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", + |r| testing_api_handler("list_aux_files", r, list_aux_files), + ) + .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 930700e50c..c69fb8c83b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] pub async fn shutdown_pageserver( tenant_manager: &TenantManager, - deletion_queue: Option, + mut deletion_queue: DeletionQueue, exit_code: i32, ) { use std::time::Duration; @@ -89,9 +89,7 @@ pub async fn shutdown_pageserver( .await; // Best effort to persist any outstanding deletions, to avoid leaking objects - if let Some(mut deletion_queue) = deletion_queue { - deletion_queue.shutdown(Duration::from_secs(5)).await; - } + deletion_queue.shutdown(Duration::from_secs(5)).await; // Shut down the HTTP endpoint last, 
so that you can still check the server's // status while it's shutting down. @@ -114,10 +112,6 @@ pub async fn shutdown_pageserver( std::process::exit(exit_code); } -/// The name of the metadata file pageserver creates per timeline. -/// Full path: `tenants//timelines//metadata`. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Per-tenant configuration file. /// Full path: `tenants//config`. pub(crate) const TENANT_CONFIG_NAME: &str = "config"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 256f2f334c..4f2c75d308 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static STANDBY_HORIZON: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_standby_horizon", + "Standby apply LSN for which GC is hold off, by timeline.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", @@ -585,6 +594,15 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -1849,7 +1867,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, - pub(crate) time_spent_on_ingest: Histogram, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -1873,12 +1890,6 @@ 
pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), - time_spent_on_ingest: register_histogram!( - "pageserver_wal_ingest_put_value_seconds", - "Actual time spent on ingesting a record", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { @@ -1990,29 +2001,6 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); -#[cfg(not(test))] -pub mod wal_redo { - use super::*; - - static PROCESS_KIND: Lazy> = Lazy::new(|| { - std::sync::Mutex::new( - register_uint_gauge_vec!( - "pageserver_wal_redo_process_kind", - "The configured process kind for walredo", - &["kind"], - ) - .unwrap(), - ) - }); - - pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { - // use guard to avoid races around the next two steps - let guard = PROCESS_KIND.lock().unwrap(); - guard.reset(); - guard.with_label_values(&[&format!("{kind}")]).set(1); - } -} - /// Similar to `prometheus::HistogramTimer` but does not record on drop. 
pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -2112,9 +2100,11 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - resident_physical_size_gauge: UIntGauge, + pub standby_horizon_gauge: IntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, + pub aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, @@ -2180,6 +2170,9 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2187,6 +2180,9 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let aux_file_size_gauge = AUX_FILE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 let directory_entries_count_gauge_closure = { let tenant_shard_id = *tenant_shard_id; @@ -2222,8 +2218,10 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, + aux_file_size_gauge, directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( @@ -2255,6 +2253,7 @@ impl TimelineMetrics { let timeline_id = 
&self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2264,6 +2263,7 @@ impl TimelineMetrics { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -2320,6 +2320,7 @@ use pin_project_lite::pin_project; use std::collections::HashMap; use std::num::NonZeroUsize; use std::pin::Pin; +use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; @@ -2329,35 +2330,35 @@ use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. 
-struct PerTimelineRemotePhysicalSizeGauge { - last_set: u64, +pub(crate) struct PerTimelineRemotePhysicalSizeGauge { + last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { - last_set: per_timeline_gauge.get(), + last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } - fn set(&mut self, sz: u64) { + pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); - if sz < self.last_set { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); + if sz < prev { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { - REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; - self.last_set = sz; } - fn get(&self) -> u64 { + pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } @@ -2365,7 +2366,7 @@ pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, shard_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, + pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, @@ -2373,38 +2374,27 @@ pub(crate) struct RemoteTimelineClientMetrics { impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id_str = tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id_str = timeline_id.to_string(); + + let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(), + ); + 
RemoteTimelineClientMetrics { - tenant_id: tenant_shard_id.tenant_id.to_string(), - shard_id: format!("{}", tenant_shard_id.shard_slug()), - timeline_id: timeline_id.to_string(), + tenant_id: tenant_id_str, + shard_id: shard_id_str, + timeline_id: timeline_id_str, calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), - remote_physical_size_gauge: Mutex::new(None), + remote_physical_size_gauge, } } - pub(crate) fn remote_physical_size_set(&self, sz: u64) { - let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - let gauge = guard.get_or_insert_with(|| { - PerTimelineRemotePhysicalSizeGauge::new( - REMOTE_PHYSICAL_SIZE - .get_metric_with_label_values(&[ - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .unwrap(), - ) - }); - gauge.set(sz); - } - - pub(crate) fn remote_physical_size_get(&self) -> u64 { - let guard = self.remote_physical_size_gauge.lock().unwrap(); - guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) - } - pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6b251283c..d250864fd6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,6 +19,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; +use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -32,6 +33,8 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; +use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; @@ -49,7 +52,6 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use 
crate::basebackup::BasebackupError; -use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; @@ -59,13 +61,15 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr; -use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::GetTenantError; +use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; +use crate::tenant::mgr::TenantManager; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; +use crate::tenant::Tenant; use crate::tenant::Timeline; use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; @@ -135,7 +139,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// Listens for connections, and launches a new handler task for each. /// pub async fn libpq_listener_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, @@ -180,7 +184,7 @@ pub async fn libpq_listener_main( "serving compute connection task", false, page_service_conn_main( - conf, + tenant_manager.clone(), broker_client.clone(), local_auth, socket, @@ -203,7 +207,7 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, @@ -260,7 +264,8 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. 
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -291,11 +296,12 @@ struct HandlerTimeline { } struct PageServerHandler { - _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, + tenant_manager: Arc, + /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, @@ -381,13 +387,13 @@ impl From for QueryError { impl PageServerHandler { pub fn new( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { - _conf: conf, + tenant_manager, broker_client, auth, claims: None, @@ -552,13 +558,9 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = mgr::get_active_tenant_with_timeout( - tenant_id, - ShardSelector::First, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) + .await?; // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { @@ -726,13 +728,9 @@ impl PageServerHandler { // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, 
ACTIVE_TENANT_TIMEOUT) + .await?; let timeline = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .await?; @@ -909,6 +907,39 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id, %lsn))] + async fn handle_make_lsn_lease( + &self, + pgb: &mut PostgresBackend, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); + let timeline = self + .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .await?; + let lease = timeline.make_lsn_lease(lsn, ctx)?; + let valid_until = lease + .valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| QueryError::Other(e.into()))?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"valid_until", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some( + &valid_until.as_millis().to_be_bytes(), + )]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + + Ok(()) + } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, @@ -1370,18 +1401,69 @@ impl PageServerHandler { timeline_id: TimelineId, selector: ShardSelector, ) -> Result, GetActiveTimelineError> { - let tenant = get_active_tenant_with_timeout( - tenant_id, - selector, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await - .map_err(GetActiveTimelineError::Tenant)?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) + .await + .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; set_tracing_field_shard_id(&timeline); Ok(timeline) } + + /// Get a shard's [`Tenant`] in its active state, if present. 
If we don't find the shard and some + /// slots for this tenant are `InProgress` then we will wait. + /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. + /// + /// `timeout` is used as a total timeout for the whole wait operation. + async fn get_active_tenant_with_timeout( + &self, + tenant_id: TenantId, + shard_selector: ShardSelector, + timeout: Duration, + ) -> Result, GetActiveTenantError> { + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + + // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is + // for handling the rare case that the slot we're accessing is InProgress. + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(&tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + ))); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = self.await_connection_cancelled() => { + return Err(GetActiveTenantError::Cancelled) + }, + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + }); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant_shard) + } } #[async_trait::async_trait] @@ -1439,9 +1521,8 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream_v2 ") { - let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); - let params = params_raw.split(' ').collect::>(); + let parts = query_string.split_whitespace().collect::>(); + if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1466,9 +1547,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("pagestream ") { - let (_, params_raw) = query_string.split_at("pagestream ".len()); - let params = params_raw.split(' ').collect::>(); + } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1493,10 +1572,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("basebackup ") { - let (_, params_raw) = query_string.split_at("basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for basebackup command" @@ -1514,26 +1590,23 @@ where 
self.check_permission(Some(tenant_id))?; - let lsn = if params.len() >= 3 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let gzip = if params.len() >= 4 { - if params[3] == "--gzip" { - true - } else { + let gzip = match params.get(3) { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {}", - params[3], - ))); + "Parameter in position 3 unknown {third_param}", + ))) } - } else { - false }; let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); @@ -1557,10 +1630,7 @@ where res?; } // return pair of prev_lsn and last_lsn - else if query_string.starts_with("get_last_record_rlsn ") { - let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for get_last_record_rlsn command" @@ -1602,10 +1672,7 @@ where .await?; } // same as basebackup, but result includes relational data as well - else if query_string.starts_with("fullbackup ") { - let (_, params_raw) = query_string.split_at("fullbackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for fullbackup command" @@ -1622,18 +1689,18 @@ where .record("timeline_id", field::display(timeline_id)); // The caller is responsible for providing correct lsn and prev_lsn. 
- let lsn = if params.len() > 2 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let prev_lsn = if params.len() > 3 { + let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { Some( - Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None @@ -1666,8 +1733,7 @@ where // 2. Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let (_, params_raw) = query_string.split_at("import basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 5 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import basebackup command" @@ -1716,8 +1782,7 @@ where // // Files are scheduled to be persisted to remote storage, and the // caller should poll the http api to check when that is done. 
- let (_, params_raw) = query_string.split_at("import wal ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 4 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import wal command" @@ -1755,10 +1820,45 @@ where // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("show ") { + } else if query_string.starts_with("lease lsn ") { + let params = &parts[2..]; + if params.len() != 3 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number {} for lease lsn command", + params.len() + ))); + } + + let tenant_shard_id = TenantShardId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_shard_id.tenant_id))?; + + // The caller is responsible for providing correct lsn. + let lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? 
+ } + }; + } else if let Some(params) = parts.strip_prefix(&["show"]) { // show - let (_, params_raw) = query_string.split_at("show ".len()); - let params = params_raw.split(' ').collect::>(); if params.len() != 1 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for config command" @@ -1771,13 +1871,13 @@ where self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout( + tenant_id, + ShardSelector::Zero, + ACTIVE_TENANT_TIMEOUT, + ) + .await?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a4215ee107..afba34c6d1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,7 +9,6 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::metrics::WAL_INGEST; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; @@ -35,12 +34,16 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; -const MAX_AUX_FILE_DELTAS: usize = 1024; +/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. +pub const MAX_AUX_FILE_DELTAS: usize = 1024; + +/// Max number of aux-file-related delta layers. 
The compaction will create a new image layer once this threshold is reached. +pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; #[derive(Debug)] pub enum LsnForTimestamp { @@ -699,13 +702,17 @@ impl Timeline { .await .context("scan")?; let mut result = HashMap::new(); + let mut sz = 0; for (_, v) in kv { let v = v.context("get value")?; let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; for (fname, content) in v { + sz += fname.len(); + sz += content.len(); result.insert(fname, content); } } + self.aux_file_size_estimator.on_base_backup(sz); Ok(result) } @@ -714,10 +721,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - match self.get_switch_aux_file_policy() { - AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, - AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, - AuxFilePolicy::CrossValidation => { + let current_policy = self.last_aux_file_policy.load(); + match current_policy { + Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, + Some(AuxFilePolicy::CrossValidation) => { let v1_result = self.list_aux_files_v1(lsn, ctx).await; let v2_result = self.list_aux_files_v2(lsn, ctx).await; match (v1_result, v2_result) { @@ -1465,7 +1473,40 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let policy = self.tline.get_switch_aux_file_policy(); + let switch_policy = self.tline.get_switch_aux_file_policy(); + + let policy = { + let current_policy = self.tline.last_aux_file_policy.load(); + // Allowed switch path: + // * no aux files -> v1/v2/cross-validation + // * cross-validation->v2 + + let current_policy = if current_policy.is_none() { + // This path will only be hit once per tenant: we will decide the final policy in this code block. + // The next call to `put_file` will always have `last_aux_file_policy != None`. 
+ let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; + if aux_files_key_v1.is_empty() { + None + } else { + self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; + Some(AuxFilePolicy::V1) + } + } else { + current_policy + }; + + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { + self.tline.do_switch_aux_policy(switch_policy)?; + info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); + switch_policy + } else { + // This branch handles non-valid migration path, and the case that switch_policy == current_policy. + // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. + current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) + } + }; + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine @@ -1474,23 +1515,45 @@ impl<'a> DatadirModification<'a> { Err(PageReconstructError::MissingKey(_)) => None, Err(e) => return Err(e.into()), }; - let files = if let Some(ref old_val) = old_val { + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { aux_file::decode_file_value(old_val)? 
} else { Vec::new() }; - let new_files = if content.is_empty() { - files - .into_iter() - .filter(|(p, _)| &path != p) - .collect::>() - } else { - files - .into_iter() - .filter(|(p, _)| &path != p) - .chain(std::iter::once((path, content))) - .collect::>() - }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); + } else { + other_files.push(file); + } + } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); + } + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); } @@ -1651,8 +1714,6 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; - let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); - let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1671,7 +1732,7 @@ impl<'a> DatadirModification<'a> { } if !self.pending_deletions.is_empty() { - writer.delete_batch(&self.pending_deletions).await?; + writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } @@ -1692,8 +1753,6 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } - timer.observe_duration(); - 
Ok(()) } @@ -1729,6 +1788,12 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } + /// Only used during unit tests, force putting a key into the modification. + #[cfg(test)] + pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 010e56a899..540eb10ed2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,7 +20,9 @@ use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::TimelineState; +use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; @@ -190,7 +192,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, - pub remote_storage: Option, + pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, } @@ -292,7 +294,7 @@ pub struct Tenant { walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage - pub(crate) remote_storage: Option, + pub(crate) remote_storage: GenericRemoteStorage, // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, @@ -528,6 +530,7 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, + last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -538,6 +541,10 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, + // This 
could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, + // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. + // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. + last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -551,21 +558,26 @@ impl Tenant { ); if let Some(index_part) = index_part.as_ref() { + timeline.remote_client.init_upload_queue(index_part)?; + timeline - .remote_client - .as_ref() - .unwrap() - .init_upload_queue(index_part)?; - } else if self.remote_storage.is_some() { + .last_aux_file_policy + .store(index_part.last_aux_file_policy()); + } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. 
- let rtc = timeline.remote_client.as_ref().unwrap(); - rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline + .remote_client + .init_upload_queue_for_empty_remote(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -777,14 +789,14 @@ impl Tenant { AttachType::Normal }; - let preload = match (&mode, &remote_storage) { - (SpawnMode::Create, _) => { + let preload = match &mode { + SpawnMode::Create => { None }, - (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { + SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) + .preload(&remote_storage, task_mgr::shutdown_token()) .await; match res { Ok(p) => Some(p), @@ -794,10 +806,7 @@ impl Tenant { } } } - (_, None) => { - let _preload_timer = TENANT.preload.start_timer(); - None - } + }; // Remote preload is complete. 
@@ -1021,7 +1030,7 @@ impl Tenant { index_part, remote_metadata, TimelineResources { - remote_client: Some(remote_client), + remote_client, deletion_queue_client: self.deletion_queue_client.clone(), timeline_get_throttle: self.timeline_get_throttle.clone(), }, @@ -1047,7 +1056,7 @@ impl Tenant { Arc::clone(self), timeline_id, &index_part.metadata, - Some(remote_timeline_client), + remote_timeline_client, self.deletion_queue_client.clone(), ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) @@ -1139,9 +1148,7 @@ impl Tenant { let mut size = 0; for timeline in self.list_timelines() { - if let Some(remote_client) = &timeline.remote_client { - size += remote_client.get_remote_physical_size(); - } + size += timeline.remote_client.get_remote_physical_size(); } size @@ -1176,12 +1183,15 @@ impl Tenant { None }; + let last_aux_file_policy = index_part.last_aux_file_policy(); + self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, + last_aux_file_policy, ctx, ) .await @@ -1191,6 +1201,7 @@ impl Tenant { pub fn create_broken_tenant( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, + remote_storage: GenericRemoteStorage, reason: String, ) -> Arc { Arc::new(Tenant::new( @@ -1205,7 +1216,7 @@ impl Tenant { ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), None, tenant_shard_id, - None, + remote_storage, DeletionQueueClient::broken(), )) } @@ -1360,6 +1371,7 @@ impl Tenant { create_guard, initdb_lsn, None, + None, ) .await } @@ -1398,13 +1410,7 @@ impl Tenant { tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. - tline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. 
@@ -1470,20 +1476,19 @@ impl Tenant { return Err(CreateTimelineError::Conflict); } - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - } + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; return Ok(existing); } @@ -1559,14 +1564,14 @@ impl Tenant { // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, // see the handling of [`TimelineExclusionError::AlreadyExists`] above. 
- if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - remote_client.wait_completion().await.with_context(|| { - format!("wait for {} timeline initial uploads to complete", kind) - })?; - } + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + loaded_timeline + .remote_client + .wait_completion() + .await + .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; loaded_timeline.activate(self.clone(), broker_client, None, ctx); @@ -2161,32 +2166,26 @@ impl Tenant { ) -> anyhow::Result<()> { let timelines = self.timelines.lock().unwrap().clone(); for timeline in timelines.values() { - let Some(tl_client) = &timeline.remote_client else { - anyhow::bail!("Remote storage is mandatory"); - }; - - let Some(remote_storage) = &self.remote_storage else { - anyhow::bail!("Remote storage is mandatory"); - }; - // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels // to ensure that they do not start a split if currently in the process of doing these. // Upload an index from the parent: this is partly to provide freshness for the // child tenants that will copy it, and partly for general ease-of-debugging: there will // always be a parent shard index in the same generation as we wrote the child shard index. - tl_client.schedule_index_upload_for_file_changes()?; - tl_client.wait_completion().await?; + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. 
- tl_client.shutdown().await; + timeline.remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). - let result = tl_client + let result = timeline.remote_client .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) .await?; @@ -2199,7 +2198,7 @@ impl Tenant { for child_shard in child_shards { upload_index_part( - remote_storage, + &self.remote_storage, child_shard, &timeline.timeline_id, self.generation, @@ -2212,6 +2211,31 @@ impl Tenant { Ok(()) } + + pub(crate) fn get_sizes(&self) -> TopTenantShardItem { + let mut result = TopTenantShardItem { + id: self.tenant_shard_id, + resident_size: 0, + physical_size: 0, + max_logical_size: 0, + }; + + for timeline in self.timelines.lock().unwrap().values() { + result.resident_size += timeline.metrics.resident_physical_size_gauge.get(); + + result.physical_size += timeline + .remote_client + .metrics + .remote_physical_size_gauge + .get(); + result.max_logical_size = std::cmp::max( + result.max_logical_size, + timeline.metrics.current_logical_size_gauge.get(), + ); + } + + result + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -2431,6 +2455,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2459,6 +2484,7 @@ impl Tenant { resources, pg_version, state, + last_aux_file_policy, self.cancel.child_token(), ); @@ -2475,7 +2501,7 @@ impl Tenant { shard_identity: 
ShardIdentity, walredo_mgr: Option>, tenant_shard_id: TenantShardId, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2800,7 +2826,7 @@ impl Tenant { // See comments in [`Tenant::branch_timeline`] for more information about why branch // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if task_mgr::is_shutdown_requested() || cancel.is_cancelled() { + if cancel.is_cancelled() { // We were requested to shut down. Stop and return with the progress we // made. break; @@ -3109,6 +3135,7 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3119,11 +3146,10 @@ impl Tenant { // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC // could get incorrect information and remove more layers, than needed. // See also https://github.com/neondatabase/neon/issues/3865 - if let Some(remote_client) = new_timeline.remote_client.as_ref() { - remote_client - .schedule_index_upload_for_full_metadata_update(&metadata) - .context("branch initial metadata upload")?; - } + new_timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata) + .context("branch initial metadata upload")?; Ok(new_timeline) } @@ -3155,11 +3181,6 @@ impl Tenant { pgdata_path: &Utf8PathBuf, timeline_id: &TimelineId, ) -> anyhow::Result<()> { - let Some(storage) = &self.remote_storage else { - // No remote storage? No upload. 
- return Ok(()); - }; - let temp_path = timelines_path.join(format!( "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" )); @@ -3183,7 +3204,7 @@ impl Tenant { backoff::retry( || async { self::remote_timeline_client::upload_initdb_dir( - storage, + &self.remote_storage, &self.tenant_shard_id.tenant_id, timeline_id, pgdata_zstd.try_clone().await?, @@ -3240,9 +3261,6 @@ impl Tenant { } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { - let Some(storage) = &self.remote_storage else { - bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); - }; if existing_initdb_timeline_id != timeline_id { let source_path = &remote_initdb_archive_path( &self.tenant_shard_id.tenant_id, @@ -3252,7 +3270,7 @@ impl Tenant { &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); // if this fails, it will get retried by retried control plane requests - storage + self.remote_storage .copy_object(source_path, dest_path, &self.cancel) .await .context("copy initdb tar")?; @@ -3260,7 +3278,7 @@ impl Tenant { let (initdb_tar_zst_path, initdb_tar_zst) = self::remote_timeline_client::download_initdb_tar_zst( self.conf, - storage, + &self.remote_storage, &self.tenant_shard_id, &existing_initdb_timeline_id, &self.cancel, @@ -3311,6 +3329,7 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, + None, ) .await?; @@ -3355,20 +3374,14 @@ impl Tenant { /// Call this before constructing a timeline, to build its required structures fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_shard_id, - timeline_id, - self.generation, - ); - Some(remote_client) - } else { - None - }; - + let remote_client = RemoteTimelineClient::new( + self.remote_storage.clone(), + 
self.deletion_queue_client.clone(), + self.conf, + self.tenant_shard_id, + timeline_id, + self.generation, + ); TimelineResources { remote_client, deletion_queue_client: self.deletion_queue_client.clone(), @@ -3388,13 +3401,14 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, + last_aux_file_policy: Option, ) -> anyhow::Result { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); - if let Some(remote_client) = &resources.remote_client { - remote_client.init_upload_queue_for_empty_remote(new_metadata)?; - } + resources + .remote_client + .init_upload_queue_for_empty_remote(new_metadata)?; let timeline_struct = self .create_timeline_struct( @@ -3403,6 +3417,7 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, + last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -3562,9 +3577,7 @@ impl Tenant { tracing::info!(timeline_id=%timeline.timeline_id, "Flushing..."); timeline.freeze_and_flush().await?; tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads..."); - if let Some(client) = &timeline.remote_client { - client.wait_completion().await?; - } + timeline.remote_client.wait_completion().await?; Ok(()) } @@ -3878,7 +3891,7 @@ pub(crate) mod harness { ShardIdentity::unsharded(), Some(walredo_mgr), self.tenant_shard_id, - Some(self.remote_storage.clone()), + self.remote_storage.clone(), self.deletion_queue.new_client(), )); @@ -3951,18 +3964,20 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; + use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; 
use pageserver_api::keyspace::KeySpace; - use pageserver_api::models::CompactionAlgorithm; + use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use utils::bin_ser::BeSer; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4764,7 +4779,12 @@ mod tests { info!("Doing vectored read on {:?}", read); let vectored_res = tline - .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .get_vectored_impl( + read.clone(), + reads_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) @@ -4813,7 +4833,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -4958,7 +4978,7 @@ mod tests { .get_vectored_impl( read.clone(), current_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await?; @@ -5093,7 +5113,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -5149,7 +5169,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = compaction_algorithm; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5506,7 +5528,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = 
compaction_algorithm; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -5534,7 +5558,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; - const STEP: usize = 100; // random update + scan base_key + idx * STEP + const STEP: usize = 10000; // random update + scan base_key + idx * STEP let cancel = CancellationToken::new(); @@ -5567,7 +5591,7 @@ mod tests { let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); - for _ in 0..10 { + for iter in 0..=10 { // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = (blknum * STEP) as u32; @@ -5582,7 +5606,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), &ctx, ) .await? @@ -5618,14 +5642,858 @@ mod tests { updated[blknum] = lsn; } - // Perform a cycle of flush, compact, and GC + // Perform two cycles of flush, compact, and GC + for round in 0..2 { + tline.freeze_and_flush().await?; + tline + .compact( + &cancel, + if iter % 5 == 0 && round == 0 { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + } else { + EnumSet::empty() + }, + &ctx, + ) + .await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = 
Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let test_key = base_key; + let mut lsn = Lsn(0x10); + + for _ in 0..20 { + lsn = Lsn(lsn.0 + 0x10); + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", 0, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + tline.freeze_and_flush().await?; // force create a delta layer + } + + let before_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + let after_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", 0, lsn)) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_branch_copies_dirty_aux_file_flag() { + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + + // the default aux file policy to switch is v1 if not set by the admins + assert_eq!( + harness.tenant_conf.switch_aux_file_policy, + AuxFilePolicy::V1 + ); + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // no aux file is written at this point, so the persistent flag should be unset + assert_eq!(tline.last_aux_file_policy.load(), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration 
through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + ); + + // we can read everything from the storage + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep v1 storage format when new files are written" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + // child copies the last flag even if that is not on remote storage yet + assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + + let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + assert_eq!(files.get("pg_logical/mappings/test2"), None); + + // even if we crash here without flushing parent timeline with it's new + // last_aux_file_policy we are safe, because child was 
never meant to access ancestor's + // files. the ancestor can even switch back to V1 because of a migration safely. + } + + #[tokio::test] + async fn aux_file_policy_switch() { + let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::CrossValidation), + "dirty index_part.json reflected state is yet to be updated" + ); + + // we can still read the auxfile v1 before we ingest anything new + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + 
assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")), + "cross validation writes to both v1 and v2 so this should be available in v2" + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V1), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"third", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V1, + "wanted state has been updated again, even if invalid request" + ); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + + // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + 
tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test3", b"last", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); + + assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + assert_eq!( + files.get("pg_logical/mappings/test3"), + Some(&bytes::Bytes::from_static(b"last")) + ); + } + + #[tokio::test] + async fn aux_file_policy_force_switch() { + let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "dirty index_part.json reflected state is yet to be updated" + ); + + // lose all data from v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + + { + lsn += 8; + let mut modification = 
tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // read data ingested in v2 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + // lose all data from v1 + assert_eq!(files.get("pg_logical/mappings/test1"), None); + } + + #[tokio::test] + async fn aux_file_policy_auto_detect() { + let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: vec![( + "test_file".to_string(), + Bytes::copy_from_slice(b"test_file"), + )] + .into_iter() + .collect(), + }) + .unwrap(); + modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + modification.commit(&ctx).await.unwrap(); + } + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep using v1 because there are aux files writting with v1" + ); + + // we can still read the auxfile v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + 
files.get("test_file"), + Some(&bytes::Bytes::from_static(b"test_file")) + ); + } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + tline.freeze_and_flush().await?; - 
tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. + assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create a image layer + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + 
lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("data key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; // this will create a delta + + { + // update the partitioning to include the test key space, otherwise they + // will be dropped by image layer creation + let mut guard = child.partitioning.lock().await; + let ((partitioning, _), partition_lsn) = &mut *guard; + partitioning + .parts + .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key + *partition_lsn = lsn; + } + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set + }, + &ctx, + ) + .await?; // force create an image layer for the keys, TODO: check if the image layer is created + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) 
+ ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + base_key_child.field1 = AUX_KEY_PREFIX; + base_key_nonexist.field1 = AUX_KEY_PREFIX; + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put( + base_key, + lsn, + &Value::Image(test_img("metadata key 1")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + + tline + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; } + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("metadata key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); 
+ set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + Ok(()) } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1dc451f5c9..24b4e4f3ea 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -299,7 +299,7 @@ mod tests { // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path()).await?; + let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; @@ -314,7 +314,7 
@@ mod tests { wtr.flush_buffer(&ctx).await?; } - let file = VirtualFile::open(pathbuf.as_path()).await?; + let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new(rdr); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 37c84be342..92928116c1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> { #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] - VirtualFile(r) => r.read_blk(blknum).await, + VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } @@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> { &self, buf: PageWriteGuard<'static>, blkno: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. 
@@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> { ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - let write_guard = self.fill_buffer(write_guard, blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a743ce3c16..342d705954 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,6 +11,7 @@ use anyhow::bail; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; +use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; @@ -320,7 +321,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithm, + pub compaction_algorithm: CompactionAlgorithmSettings, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -373,6 +374,8 @@ pub struct TenantConf { /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. 
pub switch_aux_file_policy: AuxFilePolicy, } @@ -404,7 +407,7 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -495,7 +498,9 @@ impl TenantConfOpt { .unwrap_or(global_conf.compaction_threshold), compaction_algorithm: self .compaction_algorithm - .unwrap_or(global_conf.compaction_algorithm), + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -548,7 +553,9 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM, + compaction_algorithm: CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -574,7 +581,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::V1, + switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 2e5259bfe2..3173a33dad 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: 
&GenericRemoteStorage, tenant_shard_id: &TenantShardId, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { - if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path, cancel).await }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("remove_tenant_remote_delete_mark")?; - } + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; + backoff::retry( + || async { remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remove_tenant_remote_delete_mark")?; Ok(()) } @@ -297,7 +295,7 @@ impl DeleteTenantFlow { #[instrument(skip_all)] pub(crate) async fn run( conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, cancel: &CancellationToken, @@ -308,9 +306,7 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = - Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await - { + if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -327,7 +323,7 @@ impl DeleteTenantFlow { async fn run_inner( guard: &mut OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant: &Tenant, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { @@ -339,14 +335,9 @@ impl 
DeleteTenantFlow { ))? }); - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? - // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) - .await - .context("remote_mark")? - } + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")?; fail::fail_point!("tenant-delete-before-create-local-mark", |_| { Err(anyhow::anyhow!( @@ -483,7 +474,7 @@ impl DeleteTenantFlow { fn schedule_background( guard: OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, ) { @@ -512,7 +503,7 @@ impl DeleteTenantFlow { async fn background( mut guard: OwnedMutexGuard, conf: &PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: &Arc, ) -> Result<(), DeleteTenantError> { @@ -551,7 +542,7 @@ impl DeleteTenantFlow { remove_tenant_remote_delete_mark( conf, - remote_storage.as_ref(), + &remote_storage, &tenant.tenant_shard_id, &task_mgr::shutdown_token(), ) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 8b815a1885..79cc7bf153 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -28,6 +28,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = @@ -45,6 +46,7 @@ impl EphemeralFile { .read(true) .write(true) .create(true), + ctx, ) .await?; @@ -153,7 +155,7 @@ mod tests { async fn 
test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 42def8858e..276ac87064 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -78,7 +78,7 @@ impl RW { page_cache::ReadBufResult::NotFound(write_guard) => { let write_guard = writer .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) .await?; let read_guard = write_guard.mark_valid(); return Ok(BlockLease::PageReadGuard(read_guard)); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 8ba0775120..fc71ea7642 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -214,12 +214,12 @@ impl TimelineMetadata { self.body.ancestor_timeline = Some(*timeline); } - pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) { + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, *timeline); + assert_eq!(ancestor, branchpoint.0); } if self.body.ancestor_lsn != Lsn(0) { - assert_eq!(self.body.ancestor_lsn, *ancestor_lsn); + assert_eq!(self.body.ancestor_lsn, branchpoint.1); } self.body.ancestor_timeline = None; self.body.ancestor_lsn = Lsn(0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 6be66e99ad..89fdf31849 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use 
pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; @@ -16,10 +16,9 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::Duration; use sysinfo::SystemExt; use tokio::fs; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use anyhow::Context; use once_cell::sync::Lazy; @@ -47,7 +46,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -119,6 +118,7 @@ pub(crate) enum TenantsMapRemoveResult { /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. +#[derive(Copy, Clone)] pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. 
@@ -127,6 +127,8 @@ pub(crate) enum ShardSelector { First, /// Pick the shard that holds this key Page(Key), + /// The shard ID is known: pick the given shard + Known(ShardIndex), } /// A convenience for use with the re_attach ControlPlaneClient function: rather @@ -169,6 +171,14 @@ impl TenantStartupMode { } } +/// Result type for looking up a TenantId to a specific shard +pub(crate) enum ShardResolveResult { + NotFound, + Found(Arc), + // Wait for this barrier, then query again + InProgress(utils::completion::Barrier), +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -182,51 +192,6 @@ impl TenantsMap { } } - /// A page service client sends a TenantId, and to look up the correct Tenant we must - /// resolve this to a fully qualified TenantShardId. - fn resolve_attached_shard( - &self, - tenant_id: &TenantId, - selector: ShardSelector, - ) -> Option { - let mut want_shard = None; - match self { - TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { - // Ignore all slots that don't contain an attached tenant - let tenant = match &slot.1 { - TenantSlot::Attached(t) => t, - _ => continue, - }; - - match selector { - ShardSelector::First => return Some(*slot.0), - ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return Some(*slot.0) - } - ShardSelector::Page(key) => { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time.
- if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } - - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } - _ => continue, - } - } - - // Fall through: we didn't find an acceptable shard - None - } - } - } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded @@ -391,22 +356,17 @@ async fn init_load_generations( // deletion list entries may still be valid. We provide that by pushing a recovery operation into // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions // are processed, even though we don't block on recovery completing here. - // - // Must only do this if remote storage is enabled, otherwise deletion queue - // is not running and channel push will fail. - if resources.remote_storage.is_some() { - let attached_tenants = generations - .iter() - .flat_map(|(id, start_mode)| { - match start_mode { - TenantStartupMode::Attached((_mode, generation)) => Some(generation), - TenantStartupMode::Secondary => None, - } - .map(|gen| (*id, *gen)) - }) - .collect(); - resources.deletion_queue_client.recover(attached_tenants)?; - } + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; Ok(Some(generations)) } @@ -460,53 +420,6 @@ fn load_tenant_config( } }; - // Clean up legacy `metadata` files. - // Doing it here because every single tenant directory is visited here. - // In any later code, there's different treatment of tenant dirs - // ... depending on whether the tenant is in re-attach response or not - // ... 
epending on whether the tenant is ignored or not - assert_eq!( - &conf.tenant_path(&tenant_shard_id), - &tenant_dir_path, - "later use of conf....path() methods would be dubious" - ); - let timelines: Vec = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() { - Ok(iter) => { - let mut timelines = Vec::new(); - for res in iter { - let p = res?; - let Some(timeline_id) = p.file_name().parse::().ok() else { - // skip any entries that aren't TimelineId, such as - // - *.___temp dirs - // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart) - continue; - }; - timelines.push(timeline_id); - } - timelines - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![], - Err(e) => return Err(anyhow::anyhow!(e)), - }; - for timeline_id in timelines { - let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id); - let metadata_path = timeline_path.join(METADATA_FILE_NAME); - match std::fs::remove_file(&metadata_path) { - Ok(()) => { - crashsafe::fsync(timeline_path) - .context("fsync timeline dir after removing legacy metadata file")?; - info!("removed legacy metadata file at {metadata_path}"); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // something removed the file earlier, or it was never there - // We don't care, this software version doesn't write it again, so, we're good. 
- } - Err(e) => { - anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}"); - } - } - } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); if tenant_ignore_mark_file.exists() { info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); @@ -611,6 +524,7 @@ pub async fn init_tenant_mgr( TenantSlot::Attached(Tenant::create_broken_tenant( conf, tenant_shard_id, + resources.remote_storage.clone(), format!("{}", e), )), ); @@ -803,6 +717,7 @@ fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); + let remote_storage = resources.remote_storage.clone(); let tenant = match Tenant::spawn( conf, tenant_shard_id, @@ -817,7 +732,7 @@ fn tenant_spawn( Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) + Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) } }; @@ -2103,6 +2018,77 @@ impl TenantManager { Ok(reparented) } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. + /// + /// During shard splits: we shall see parent shards in InProgress state and skip them, and + /// instead match on child shards which should appear in Attached state. Very early in a shard + /// split, or in other cases where a shard is InProgress, we will return our own InProgress result + /// to instruct the caller to wait for that to finish before querying again. 
+ pub(crate) fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> ShardResolveResult { + let tenants = self.tenants.read().unwrap(); + let mut want_shard = None; + let mut any_in_progress = None; + + match &*tenants { + TenantsMap::Initializing => ShardResolveResult::NotFound, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + TenantSlot::InProgress(barrier) => { + // We might still find a usable shard, but in case we don't, remember that + // we saw at least one InProgress slot, so that we can distinguish this case + // from a simple NotFound in our return value. + any_in_progress = Some(barrier.clone()); + continue; + } + _ => continue, + }; + + match selector { + ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return ShardResolveResult::Found(tenant.clone()) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return ShardResolveResult::Found(tenant.clone()); + } + } + ShardSelector::Known(shard) + if tenant.shard_identity.shard_index() == shard => + { + return ShardResolveResult::Found(tenant.clone()); + } + _ => continue, + } + } + + // Fall through: we didn't find a slot that was in Attached state & matched our selector. If + // we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise + // this requested shard simply isn't found. 
+ if let Some(barrier) = any_in_progress { + ShardResolveResult::InProgress(barrier) + } else { + ShardResolveResult::NotFound + } + } + } + } } #[derive(Debug, thiserror::Error)] @@ -2151,105 +2137,6 @@ pub(crate) enum GetActiveTenantError { Broken(String), } -/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] -/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`], -/// then wait for up to `timeout` (minus however long we waited for the slot). -pub(crate) async fn get_active_tenant_with_timeout( - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - cancel: &CancellationToken, -) -> Result, GetActiveTenantError> { - enum WaitFor { - Barrier(utils::completion::Barrier), - Tenant(Arc), - } - - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - let (wait_for, tenant_shard_id) = { - let locked = TENANTS.read().unwrap(); - - // Resolve TenantId to TenantShardId - let tenant_shard_id = locked - .resolve_attached_shard(&tenant_id, shard_selector) - .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - )))?; - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Active => { - // Fast path: we don't need to do any async waiting. 
- return Ok(tenant.clone()); - } - _ => { - tenant.activate_now(); - (WaitFor::Tenant(tenant.clone()), tenant_shard_id) - } - } - } - Some(TenantSlot::Secondary(_)) => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - Some(TenantSlot::InProgress(barrier)) => { - (WaitFor::Barrier(barrier.clone()), tenant_shard_id) - } - None => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))) - } - } - }; - - let tenant = match wait_for { - WaitFor::Barrier(barrier) => { - tracing::debug!("Waiting for tenant InProgress state to pass..."); - timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - barrier.wait(), - ) - .await - .map_err(|e| match e { - TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: wait_start.elapsed(), - }, - TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled, - })?; - { - let locked = TENANTS.read().unwrap(); - let peek_slot = - tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => tenant.clone(), - _ => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - } - } - } - WaitFor::Tenant(tenant) => tenant, - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] @@ -2276,7 +2163,7 @@ pub(crate) async fn load_tenant( tenant_id: TenantId, generation: Generation, broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ 
-2880,86 +2767,73 @@ use { utils::http::error::ApiError, }; -pub(crate) fn immediate_gc( +#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] +pub(crate) async fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, -) -> Result>, ApiError> { - let guard = TENANTS.read().unwrap(); - - let tenant = guard - .get(&tenant_shard_id) - .cloned() - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; +) -> Result { + let tenant = { + let guard = TENANTS.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? + }; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // Run in task_mgr to avoid race with tenant_detach operation - let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - // TODO: spawning is redundant now, need to hold the gate - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_shard_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - #[allow(unused_mut)] - let mut result = tenant 
- .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. + fail::fail_point!("immediate_gc_task_pre"); - #[cfg(feature = "testing")] - { - // we need to synchronize with drop completion for python tests without polling for - // log messages - if let Ok(result) = result.as_mut() { - let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. 
- let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); - - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care about the shutdown error, just exit fast - drop(rtc.wait_completion().await); - } + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); } - - match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); } - Ok(()) } - .instrument(span) - ); - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().map(|x| &x.remote_client); - Ok(wait_task_done) + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(ApiError::InternalServerError) } #[cfg(test)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 356a0dc51c..d3adae6841 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -189,6 +189,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::models::AuxFilePolicy; use 
pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -240,7 +241,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::metadata::MetadataUpdate; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -317,7 +318,7 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, - metrics: Arc, + pub(crate) metrics: Arc, storage_impl: GenericRemoteStorage, @@ -437,6 +438,19 @@ impl RemoteTimelineClient { } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + // technically this is a dirty read, but given how timeline detach ancestor is implemented + // via tenant restart, the lineage has always been uploaded. + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -448,11 +462,11 @@ impl RemoteTimelineClient { } else { 0 }; - self.metrics.remote_physical_size_set(size); + self.metrics.remote_physical_size_gauge.set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_get() + self.metrics.remote_physical_size_gauge.get() } // @@ -503,8 +517,9 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. 
pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { @@ -523,6 +538,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + local_path, cancel, ctx, ) @@ -596,6 +612,17 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + pub(crate) fn schedule_index_upload_for_aux_file_policy_update( + self: &Arc, + last_aux_file_policy: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue); + Ok(()) + } /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -628,7 +655,7 @@ impl RemoteTimelineClient { ); let index_part = IndexPart::from(&*upload_queue); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); + let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -647,7 +674,14 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; + let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + upload_queue.latest_metadata.reparent(new_parent); + upload_queue.latest_lineage.record_previous_ancestor(&prev); self.schedule_index_upload(upload_queue); @@ -670,14 +704,13 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue - 
.latest_metadata - .detach_from_ancestor(&adopted.0, &adopted.1); + upload_queue.latest_metadata.detach_from_ancestor(&adopted); + upload_queue.latest_lineage.record_detaching(&adopted); for layer in layers { upload_queue .latest_files - .insert(layer.layer_desc().filename(), layer.metadata()); + .insert(layer.layer_desc().layer_name(), layer.metadata()); } self.schedule_index_upload(upload_queue); @@ -713,7 +746,7 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( @@ -737,7 +770,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -765,7 +798,7 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); @@ -780,9 +813,9 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Vec<(LayerName, LayerFileMetadata)> where - I: IntoIterator, + I: IntoIterator, { // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata @@ -832,7 +865,7 @@ impl RemoteTimelineClient { /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. 
pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -845,7 +878,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. These are // layers that originate from some ancestor shard after a split, and may still @@ -914,7 +947,7 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1108,6 +1141,11 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) fn is_deleting(&self) -> bool { + let mut locked = self.upload_queue.lock().unwrap(); + locked.stopped_mut().is_ok() + } + pub(crate) async fn preserve_initdb_archive( self: &Arc, tenant_id: &TenantId, @@ -1144,7 +1182,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - &uploaded.layer_desc().filename(), + &uploaded.layer_desc().layer_name(), uploaded.metadata().generation, ); @@ -1185,7 +1223,7 @@ impl RemoteTimelineClient { .get_timeline_id() .expect("Source timeline should be alive"), self.tenant_shard_id.to_index(), - &adopted.layer_desc().filename(), + &adopted.layer_desc().layer_name(), adopted.metadata().generation, ); @@ -1193,7 +1231,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - 
&adopted_as.layer_desc().filename(), + &adopted_as.layer_desc().layer_name(), adopted_as.metadata().generation, ); @@ -1527,7 +1565,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, layer_metadata.shard, - &layer.layer_desc().filename(), + &layer.layer_desc().layer_name(), layer_metadata.generation, ); @@ -1811,6 +1849,7 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), + latest_lineage: initialized.latest_lineage.clone(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn @@ -1824,6 +1863,7 @@ impl RemoteTimelineClient { dangling_files: HashMap::default(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: initialized.last_aux_file_policy, }; let upload_queue = std::mem::replace( @@ -1896,14 +1936,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -2000,8 +2040,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -2112,7 +2152,7 @@ mod tests { tenant_ctx: _tenant_ctx, } = test_setup; - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; // Download back the 
index.json, and check that the list of files is correct let initial_index_part = match client @@ -2127,7 +2167,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -2153,7 +2193,7 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { + .map(|(name, contents): (LayerName, Vec)| { let local_path = local_layer_path( harness.conf, @@ -2234,9 +2274,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2250,7 +2290,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. 
client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2268,9 +2308,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2283,9 +2323,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2303,9 +2343,9 @@ mod tests { timeline, .. } = TestSetup::new("metrics").await.unwrap(); - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let local_path = local_layer_path( harness.conf, &timeline.tenant_shard_id, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c86b22d481..70c5cae05e 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,8 +21,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use 
crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; -use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; @@ -48,21 +47,15 @@ pub async fn download_layer_file<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = local_layer_path( - conf, - &tenant_shard_id, - &timeline_id, - layer_file_name, - &layer_metadata.generation, - ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>( // For more context about durable_rename check this email from postgres mailing list: // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. 
- let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, @@ -112,14 +105,17 @@ pub async fn download_layer_file<'a>( // We use fatal_err() below because the after the rename above, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - let work = async move { - let timeline_dir = VirtualFile::open(&timeline_path) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + let work = { + let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior()); + async move { + let timeline_dir = VirtualFile::open(&timeline_path, &ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } }; crate::virtual_file::io_engine::get() .spawn_blocking_and_block_on_if_std(work) @@ -196,7 +192,7 @@ async fn download_object<'a>( use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; use bytes::BytesMut; async { - let destination_file = VirtualFile::create(dst_path) + let destination_file = VirtualFile::create(dst_path, ctx) .await .with_context(|| format!("create a destination file for layer '{dst_path}'")) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7e0619945f..032dda7ff3 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -5,10 +5,12 @@ use std::collections::HashMap; use chrono::NaiveDateTime; +use 
pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -75,7 +77,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is @@ -84,6 +86,19 @@ pub struct IndexPart { #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, + + /// Describes the kind of aux files stored in the timeline. + /// + /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. + /// A V1 setting after V2 files have been committed is not accepted. + /// + /// None means no aux files have been written to the storage before the point + /// when this flag is introduced. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) last_aux_file_policy: Option, } impl IndexPart { @@ -96,17 +111,21 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + /// - 6: last_aux_file_policy is added. + const LATEST_VERSION: usize = 6; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6]; pub const FILE_NAME: &'static str = "index_part.json"; fn new( - layers_and_metadata: &HashMap, + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, + lineage: Lineage, + last_aux_file_policy: Option, ) -> Self { let layer_metadata = layers_and_metadata .iter() @@ -119,6 +138,8 @@ impl IndexPart { disk_consistent_lsn, metadata, deleted_at: None, + lineage, + last_aux_file_policy, } } @@ -147,16 +168,29 @@ impl IndexPart { &HashMap::new(), example_metadata.disk_consistent_lsn(), example_metadata, + Default::default(), + Some(AuxFilePolicy::V1), ) } + + pub(crate) fn last_aux_file_policy(&self) -> Option { + self.last_aux_file_policy + } } impl From<&UploadQueueInitialized> for IndexPart { fn from(uq: &UploadQueueInitialized) -> Self { let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); let metadata = uq.latest_metadata.clone(); + let lineage = uq.latest_lineage.clone(); - Self::new(&uq.latest_files, disk_consistent_lsn, metadata) + Self::new( + &uq.latest_files, + disk_consistent_lsn, + metadata, + lineage, + uq.last_aux_file_policy, + ) } } @@ -184,8 +218,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata { } } +/// Limited history of earlier ancestors. +/// +/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly +/// reparented by having an later timeline be detached from it's ancestor. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub(crate) struct Lineage { + /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`]. + #[serde(skip_serializing_if = "is_false", default)] + reparenting_history_truncated: bool, + + /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] + /// + /// These are stored in case we want to support WAL based DR on the timeline. 
There can be many + /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings + /// after [`Self::original_ancestor`] has been set. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + reparenting_history: Vec, + + /// The ancestor from which this timeline has been detached from and when. + /// + /// If you are adding support for detaching from a hierarchy, consider changing the ancestry + /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + #[serde(skip_serializing_if = "Option::is_none", default)] + original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, +} + +fn is_false(b: &bool) -> bool { + !b +} + +impl Lineage { + const REMEMBER_AT_MOST: usize = 100; + + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + if self.reparenting_history.last() == Some(old_ancestor) { + // do not re-record it + return; + } + + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; + + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + } + + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { + assert!(self.original_ancestor.is_none()); + + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + } + + /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed + /// to start a read/write primary at this lsn". + /// + /// Returns true if the Lsn was previously a branch point. 
+ pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .as_ref() + .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + } +} + #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -221,6 +323,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -261,6 +365,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -302,7 +408,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -347,6 +455,8 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -385,11 +495,110 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + "disk_consistent_lsn":"0/15A7618", + 
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v6_indexpart_is_parsed() { + let example = r#"{ + "version":6, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + 
"disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123", + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + }, + "last_aux_file_policy": "V2" + }"#; + + let expected = IndexPart { + version: 6, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: Some(chrono::NaiveDateTime::parse_from_str( + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: Some(AuxFilePolicy::V2), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + 
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 0bb25f0ace..252b6eb11b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -6,11 +6,9 @@ mod scheduler; use std::{sync::Arc, time::SystemTime}; use crate::{ - config::PageServerConf, context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - virtual_file::MaybeFatalIo, }; use self::{ @@ -21,9 +19,8 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::LayerName, }; use pageserver_api::{ @@ -178,13 +175,7 @@ impl SecondaryTenant { /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] - pub(crate) async fn evict_layer( - self: &Arc, - conf: &PageServerConf, - timeline_id: TimelineId, - name: LayerFileName, - metadata: LayerFileMetadata, - ) { + pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { debug_assert_current_span_has_tenant_id(); let guard = match self.gate.enter() { @@ -197,41 +188,11 @@ impl SecondaryTenant { let now = SystemTime::now(); - let local_path = local_layer_path( - conf, - &self.tenant_shard_id, - &timeline_id, - &name, - &metadata.generation, - ); - let this = self.clone(); // spawn it to be cancellation safe tokio::task::spawn_blocking(move || { let _guard = guard; - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. 
- // Other local I/O errors are process-fatal: these should never happen. - let deleted = std::fs::remove_file(local_path); - - let not_found = deleted - .as_ref() - .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); - - let deleted = if not_found { - false - } else { - deleted - .map(|()| true) - .fatal_err("Deleting layer during eviction") - }; - - if !deleted { - // skip updating accounting and putting perhaps later timestamp - return; - } // Update the timeline's state. This does not have to be synchronized with // the download process, because: @@ -250,8 +211,15 @@ impl SecondaryTenant { // of the cache. let mut detail = this.detail.lock().unwrap(); if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); + let removed = timeline_detail.on_disk_layers.remove(&name); + + // We might race with removal of the same layer during downloads, if it was removed + // from the heatmap. If we see that the OnDiskState is gone, then no need to + // do a physical deletion or store in evicted_at. 
+ if let Some(removed) = removed { + removed.remove_blocking(); + timeline_detail.evicted_at.insert(name, now); + } } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 092630e74d..870475eb57 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,11 +22,11 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - METADATA_FILE_NAME, TEMP_FILE_SUFFIX, + TEMP_FILE_SUFFIX, }; use super::{ @@ -45,10 +45,10 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; -use futures::Future; +use futures::{Future, StreamExt}; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -62,14 +62,16 @@ use super::{ CommandRequest, DownloadCommand, }; -/// For each tenant, how long must have passed since the last download_tenant call before -/// calling it again. This is approximately the time by which local data is allowed -/// to fall behind remote data. -/// -/// TODO: this should just be a default, and the actual period should be controlled -/// via the heatmap itself -/// `` -const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// For each tenant, default period for how long must have passed since the last download_tenant call before +/// calling it again. 
This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first +/// download, if the uploader populated it. +const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); + +/// Range of concurrency we may use when downloading layers within a timeline. This is independent +/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in +/// `PageServerConf::secondary_download_concurrency` +const MAX_LAYER_CONCURRENCY: usize = 16; +const MIN_LAYER_CONCURRENCY: usize = 1; pub(super) async fn downloader_task( tenant_manager: Arc, @@ -79,14 +81,15 @@ pub(super) async fn downloader_task( cancel: CancellationToken, root_ctx: RequestContext, ) { - let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + // How many tenants' secondary download operations we will run concurrently + let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; - let mut scheduler = Scheduler::new(generator, concurrency); + let mut scheduler = Scheduler::new(generator, tenant_concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) @@ -104,6 +107,7 @@ struct SecondaryDownloader { pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, } impl OnDiskState { @@ -111,23 +115,46 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, + local_path, } } + + // This is infallible, because all errors are either acceptable (ENOENT), or totally + // unexpected (fatal). 
+ pub(super) fn remove_blocking(&self) { + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + std::fs::remove_file(&self.local_path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Deleting secondary layer") + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. - pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, +} + +// Aspects of a heatmap that we remember after downloading it +#[derive(Clone, Debug)] +struct DownloadSummary { + etag: Etag, + #[allow(unused)] + mtime: SystemTime, + upload_period: Duration, } /// This state is written by the secondary downloader, it is opaque @@ -136,8 +163,7 @@ pub(super) struct SecondaryDetailTimeline { pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, - last_download: Option, - last_etag: Option, + last_download: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -167,7 +193,6 @@ impl SecondaryDetail { Self { config, last_download: None, - last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -221,9 +246,8 @@ impl SecondaryDetail { struct PendingDownload { secondary_state: Arc, - last_download: Option, + last_download: Option, target_time: Option, - period: Option, } impl scheduler::PendingJob for PendingDownload { @@ -273,10 +297,17 @@ impl JobGenerator SchedulingResult { @@ -309,11 +340,11 @@ impl JobGenerator next_download { @@ -321,7 +352,6 @@ impl JobGenerator TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); // We will use the etag from last successful download to make the download conditional on changes - let last_etag = 
self + let last_download = self .secondary_state .detail .lock() .unwrap() - .last_etag + .last_download .clone(); // Download the tenant's heatmap @@ -517,7 +540,7 @@ impl<'a> TenantDownloader<'a> { etag: heatmap_etag, bytes: heatmap_bytes, } = match tokio::select!( - bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, + bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) ) { HeatMapDownload::Unmodified => { @@ -546,6 +569,39 @@ impl<'a> TenantDownloader<'a> { heatmap.timelines.len() ); + // Get or initialize the local disk state for the timelines we will update + let mut timeline_states = HashMap::new(); + for timeline in &heatmap.timelines { + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + timeline_states.insert(timeline.timeline_id, timeline_state); + } + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this // phase to initialize our SecondaryProgress. 
@@ -556,6 +612,10 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { + let timeline_state = timeline_states + .remove(&timeline.timeline_id) + .expect("Just populated above"); + if self.secondary_state.cancel.is_cancelled() { tracing::debug!( "Cancelled before downloading timeline {}", @@ -565,7 +625,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, ctx) + self.download_timeline(timeline, timeline_state, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -577,7 +637,30 @@ impl<'a> TenantDownloader<'a> { // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. - self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { + etag: heatmap_etag, + mtime: heatmap_mtime, + upload_period: heatmap + .upload_period_ms + .map(|ms| Duration::from_millis(ms as u64)) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), + }); + + // Robustness: we should have updated progress properly, but in case we didn't, make sure + // we don't leave the tenant in a state where we claim to have successfully downloaded + // everything, but our progress is incomplete. The invariant here should be that if + // we have set `last_download` to this heatmap's etag, then the next time we see that + // etag we can safely do no work (i.e. we must be complete). 
+ let mut progress = self.secondary_state.progress.lock().unwrap(); + debug_assert!(progress.layers_downloaded == progress.layers_total); + debug_assert!(progress.bytes_downloaded == progress.bytes_total); + if progress.layers_downloaded != progress.layers_total + || progress.bytes_downloaded != progress.bytes_total + { + tracing::warn!("Correcting drift in progress stats ({progress:?})"); + progress.layers_downloaded = progress.layers_total; + progress.bytes_downloaded = progress.bytes_total; + } Ok(()) } @@ -754,6 +837,7 @@ impl<'a> TenantDownloader<'a> { async fn download_timeline( &self, timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, ctx: &RequestContext, ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -762,36 +846,10 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - // Clone a view of what layers already exist on disk - let timeline_state = self - .secondary_state - .detail - .lock() - .unwrap() - .timelines - .get(&timeline.timeline_id) - .cloned(); - - let timeline_state = match timeline_state { - Some(t) => t, - None => { - // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, &timeline).await; - - // Re-acquire detail lock now that we're done with async load from local FS - self.secondary_state - .detail - .lock() - .unwrap() - .timelines - .insert(timeline.timeline_id, timeline_state.clone()); - timeline_state - } - }; - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + let mut download_futs = Vec::new(); + // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -807,20 +865,12 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on disk are really there. - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - match tokio::fs::metadata(&local_path).await { + match tokio::fs::metadata(&on_disk.local_path).await { Ok(meta) => { tracing::debug!( "Layer {} present at {}, size {}", layer.name, - local_path, + on_disk.local_path, meta.len(), ); } @@ -828,7 +878,7 @@ impl<'a> TenantDownloader<'a> { tracing::warn!( "Layer {} not found at {} ({})", layer.name, - local_path, + on_disk.local_path, e ); debug_assert!(false); @@ -874,67 +924,33 @@ impl<'a> TenantDownloader<'a> { } } - // Failpoint for simulating slow remote storage - failpoint_support::sleep_millis_async!( - "secondary-layer-download-sleep", - &self.secondary_state.cancel - ); - - // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally - let downloaded_bytes = match download_layer_file( - self.conf, - self.remote_storage, - *tenant_shard_id, - timeline.timeline_id, - &layer.name, - &LayerFileMetadata::from(&layer.metadata), - &self.secondary_state.cancel, + download_futs.push(self.download_layer( + tenant_shard_id, + &timeline.timeline_id, + layer, ctx, - ) - .await - { - Ok(bytes) => bytes, - Err(DownloadError::NotFound) => { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. 
- tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; + )); + } + + // Break up layer downloads into chunks, so that for each chunk we can re-check how much + // concurrency to use based on activity level of remote storage. + while !download_futs.is_empty() { + let chunk = + download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY)); + + let concurrency = Self::layer_concurrency(self.remote_storage.activity()); + + let mut result_stream = futures::stream::iter(chunk).buffered(concurrency); + let mut result_stream = std::pin::pin!(result_stream); + while let Some(result) = result_stream.next().await { + match result { + Err(e) => return Err(e), + Ok(None) => { + // No error, but we didn't download the layer. Don't mark it touched + } + Ok(Some(layer)) => touched.push(layer), } - Err(e) => return Err(e.into()), - }; - - if downloaded_bytes != layer.metadata.file_size { - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - tracing::warn!( - "Downloaded layer {} with unexpected size {} != {}. Removing download.", - layer.name, - downloaded_bytes, - layer.metadata.file_size - ); - - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found)?; - } else { - tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); - let mut progress = self.secondary_state.progress.lock().unwrap(); - progress.bytes_downloaded += downloaded_bytes; - progress.layers_downloaded += 1; } - - SECONDARY_MODE.download_layer.inc(); - touched.push(layer) } // Write updates to state to record layers we just downloaded or touched. 
@@ -951,6 +967,13 @@ impl<'a> TenantDownloader<'a> { v.get_mut().access_time = t.access_time; } Entry::Vacant(e) => { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &t.name, + &t.metadata.generation, + ); e.insert(OnDiskState::new( self.conf, tenant_shard_id, @@ -958,6 +981,7 @@ impl<'a> TenantDownloader<'a> { t.name, LayerFileMetadata::from(&t.metadata), t.access_time, + local_path, )); } } @@ -966,6 +990,107 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + + async fn download_layer( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer: HeatMapLayer, + ctx: &RequestContext, + ) -> Result, UpdateError> { + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + *timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &local_path, + &self.secondary_state.cancel, + ctx, + ) + .await + { + Ok(bytes) => bytes, + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + + // If the layer is 404, adjust the progress statistics to reflect that we will not download it. 
+ let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + + return Ok(None); + } + Err(e) => return Err(e.into()), + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; + } + + SECONDARY_MODE.download_layer.inc(); + + Ok(Some(layer)) + } + + /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage + fn layer_concurrency(activity: RemoteStorageActivity) -> usize { + // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping + // of our concurrency range to the units available within the remaining 25%. + let clamp_at = (activity.read_total * 3) / 4; + if activity.read_available > clamp_at { + (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at)) + / (activity.read_total - clamp_at) + } else { + MIN_LAYER_CONCURRENCY + } + } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -997,7 +1122,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. 
- let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -1015,11 +1140,7 @@ async fn init_timeline_state( .fatal_err(&format!("Read metadata on {}", file_path)); let file_name = file_path.file_name().expect("created it from the dentry"); - if file_name == METADATA_FILE_NAME { - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. - warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); - continue; - } else if crate::is_temporary(&file_path) + if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) || is_ephemeral_file(file_name) { @@ -1034,7 +1155,7 @@ async fn init_timeline_state( continue; } - match LayerFileName::from_str(file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { @@ -1061,6 +1182,7 @@ async fn init_timeline_state( name, LayerFileMetadata::from(&remote_meta.metadata), remote_meta.access_time, + file_path, ), ); } @@ -1092,3 +1214,58 @@ async fn init_timeline_state( detail } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn layer_concurrency() { + // Totally idle + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY + ); + + // Totally busy + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 0, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MIN_LAYER_CONCURRENCY + ); + + // Edge of the range at which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 12, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + 
MIN_LAYER_CONCURRENCY + ); + + // Midpoint of the range in which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 14, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY / 2 + ); + } +} diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 73cdf6c6d4..2da4a3b9d5 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -17,6 +15,14 @@ pub(super) struct HeatMapTenant { pub(super) generation: Generation, pub(super) timelines: Vec, + + /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders + /// of how frequently it is worthwhile to check for updates. + /// + /// This is optional for backward compat, and because we sometimes might upload + /// a heatmap explicitly via API for a tenant that has no periodic upload configured. 
+ #[serde(default)] + pub(super) upload_period_ms: Option, } #[serde_as] @@ -31,7 +37,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, + pub(super) name: LayerName, pub(super) metadata: IndexLayerMetadata, #[serde_as(as = "TimestampSeconds")] @@ -42,7 +48,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, + name: LayerName, metadata: IndexLayerMetadata, access_time: SystemTime, ) -> Self { @@ -83,4 +89,21 @@ impl HeatMapTenant { stats } + + pub(crate) fn strip_atimes(self) -> Self { + Self { + timelines: self + .timelines + .into_iter() + .map(|mut tl| { + for layer in &mut tl.layers { + layer.access_time = SystemTime::UNIX_EPOCH; + } + tl + }) + .collect(), + generation: self.generation, + upload_period_ms: self.upload_period_ms, + } + } } diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 352409f5fc..fddced3ead 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress { struct UploadPending { tenant: Arc, - last_digest: Option, + last_upload: Option, target_time: Option, period: Option, } @@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending { struct WriteComplete { tenant_shard_id: TenantShardId, completed_at: Instant, - digest: Option, + uploaded: Option, next_upload: Option, } @@ -115,10 +115,7 @@ struct UploaderTenantState { tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded - /// - /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, - /// which is also an md5sum. 
- last_digest: Option, + last_upload_state: Option, /// When the last upload attempt completed (may have been successful or failed) last_upload: Option, @@ -187,7 +184,7 @@ impl JobGenerator tenant: Arc::downgrade(&tenant), last_upload: None, next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), - last_digest: None, + last_upload_state: None, }); // Decline to do the upload if insufficient time has passed @@ -195,10 +192,10 @@ impl JobGenerator return; } - let last_digest = state.last_digest; + let last_upload = state.last_upload_state.clone(); result.jobs.push(UploadPending { tenant, - last_digest, + last_upload, target_time: state.next_upload, period: Some(period), }); @@ -218,7 +215,7 @@ impl JobGenerator ) { let UploadPending { tenant, - last_digest, + last_upload, target_time, period, } = job; @@ -231,16 +228,16 @@ impl JobGenerator let _completion = completion; let started_at = Instant::now(); - let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { - Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await { + Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => { let duration = Instant::now().duration_since(started_at); SECONDARY_MODE .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap.inc(); - Some(digest) + Some(uploaded) } - Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest, + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload, Err(UploadHeatmapError::Upload(e)) => { tracing::warn!( "Failed to upload heatmap for tenant {}: {e:#}", @@ -251,11 +248,11 @@ impl JobGenerator .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap_errors.inc(); - last_digest + last_upload } Err(UploadHeatmapError::Cancelled) => { tracing::info!("Cancelled heatmap upload, shutting down"); - last_digest + 
last_upload } }; @@ -277,7 +274,7 @@ impl JobGenerator WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, - digest, + uploaded, next_upload, } }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) @@ -299,7 +296,7 @@ impl JobGenerator Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed - last_digest: None, + last_upload: None, tenant, target_time: None, period: None, @@ -312,7 +309,7 @@ impl JobGenerator let WriteComplete { tenant_shard_id, completed_at, - digest, + uploaded, next_upload, } = completion; use std::collections::hash_map::Entry; @@ -322,7 +319,7 @@ impl JobGenerator } Entry::Occupied(mut entry) => { entry.get_mut().last_upload = Some(completed_at); - entry.get_mut().last_digest = digest; + entry.get_mut().last_upload_state = uploaded; entry.get_mut().next_upload = next_upload } } @@ -331,7 +328,7 @@ impl JobGenerator enum UploadHeatmapOutcome { /// We successfully wrote to remote storage, with this digest. - Uploaded(md5::Digest), + Uploaded(LastUploadState), /// We did not upload because the heatmap digest was unchanged since the last upload NoChange, /// We skipped the upload for some reason, such as tenant/timeline not ready @@ -347,12 +344,25 @@ enum UploadHeatmapError { Upload(#[from] anyhow::Error), } +/// Digests describing the heatmap we most recently uploaded successfully. +/// +/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, +/// which is also an md5sum. +#[derive(Clone)] +struct LastUploadState { + // Digest of json-encoded HeatMapTenant + uploaded_digest: md5::Digest, + + // Digest without atimes set. + layers_only_digest: md5::Digest, +} + /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. 
async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, - last_digest: Option, + last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); @@ -368,6 +378,7 @@ async fn upload_tenant_heatmap( let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, + upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); @@ -396,15 +407,31 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; - let bytes = bytes::Bytes::from(bytes); - let size = bytes.len(); // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); - if Some(digest) == last_digest { + if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } + // Calculate a digest that omits atimes, so that we can distinguish actual changes in + // layers from changes only in atimes. + let heatmap_size_bytes = heatmap.get_stats().bytes; + let layers_only_bytes = + serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; + let layers_only_digest = md5::compute(&layers_only_bytes); + if heatmap_size_bytes < tenant.get_checkpoint_distance() { + // For small tenants, skip upload if only atimes changed. This avoids doing frequent + // uploads from long-idle tenants whose atimes are just incremented by periodic + // size calculations. 
+ if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { + return Ok(UploadHeatmapOutcome::NoChange); + } + } + + let bytes = bytes::Bytes::from(bytes); + let size = bytes.len(); + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); let cancel = &tenant.cancel; @@ -436,5 +463,8 @@ async fn upload_tenant_heatmap( tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); - Ok(UploadHeatmapOutcome::Uploaded(digest)) + Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { + uploaded_digest: digest, + layers_only_digest, + })) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4f1b56ef9f..9ccf20c0d4 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,11 +1,11 @@ //! Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; @@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; @@ -113,12 +113,20 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get +/// Bag of data accumulated during a vectored get. pub(crate) struct ValuesReconstructState { + /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` + /// should not expect to get anything from this hashmap.
pub(crate) keys: HashMap>, - + /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, + + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, + delta_layers_visited: u32, } impl ValuesReconstructState { @@ -126,7 +134,9 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, layers_visited: 0, + delta_layers_visited: 0, } } @@ -140,8 +150,17 @@ impl ValuesReconstructState { } } - pub(crate) fn on_layer_visited(&mut self) { + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { self.layers_visited += 1; + if let ReadableLayer::PersistentLayer(layer) = layer { + if layer.layer_desc().is_delta() { + self.delta_layers_visited += 1; + } + } + } + + pub(crate) fn get_delta_layers_visited(&self) -> u32 { + self.delta_layers_visited } pub(crate) fn get_layers_visited(&self) -> u32 { @@ -171,6 +190,16 @@ impl ValuesReconstructState { } } + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. + pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -233,8 +262,12 @@ impl ValuesReconstructState { /// Returns the key space describing the keys that have /// been marked as completed since the last call to this function. - pub(crate) fn consume_done_keys(&mut self) -> KeySpace { - self.keys_done.consume_keyspace() + /// Returns individual keys done, and the image layer coverage. 
+ pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) } } @@ -646,8 +679,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -658,8 +691,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -670,11 +703,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6fd96b0e2f..1b3802840f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -68,7 +69,8 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, }; /// @@ -309,13 +311,13 @@ impl DeltaLayer { .and_then(|res| res)?; // not production code - let actual_filename = 
path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) @@ -392,6 +394,7 @@ impl DeltaLayerWriterInner { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. We will @@ -402,7 +405,7 @@ impl DeltaLayerWriterInner { let path = DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path).await?; + let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -584,6 +587,7 @@ impl DeltaLayerWriter { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( @@ -593,6 +597,7 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + ctx, ) .await?, ), @@ -699,6 +704,7 @@ impl DeltaLayer { let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -732,7 +738,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> 
{ - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -906,7 +912,7 @@ impl DeltaLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); @@ -1010,6 +1016,7 @@ impl DeltaLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let vectored_blob_reader = VectoredBlobReader::new(&self.file); let mut ignore_key_with_err = None; @@ -1027,7 +1034,7 @@ impl DeltaLayerInner { // track when a key is done. for read in reads.into_iter().rev() { let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) .await; let blobs_buf = match res { @@ -1272,7 +1279,7 @@ impl DeltaLayerInner { buf.clear(); buf.reserve(read.size()); - let res = reader.read_blobs(&read, buf).await?; + let res = reader.read_blobs(&read, buf, ctx).await?; for blob in res.blobs { let key = blob.meta.key; @@ -1789,6 +1796,7 @@ mod test { harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), + &ctx, ) .await?; @@ -1846,7 +1854,7 @@ mod test { for read in vectored_reads { let blobs_buf = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; for meta in blobs_buf.blobs.iter() { let value = &blobs_buf.buf[meta.start..meta.end]; @@ -1976,6 +1984,7 @@ mod test { tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, + ctx, ) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 
1477a1fc33..becd1e7a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -54,6 +54,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -65,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -155,6 +158,7 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, file: VirtualFile, @@ -231,7 +235,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -267,13 +271,13 @@ impl ImageLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -340,6 +344,7 @@ impl ImageLayer { let mut file = VirtualFile::open_with_options( path, 
virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -374,7 +379,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -415,6 +420,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -471,9 +477,11 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; + reconstruct_state.on_image_layer_visited(&self.key_range); + Ok(()) } @@ -534,6 +542,7 @@ impl ImageLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let max_vectored_read_bytes = self .max_vectored_read_bytes @@ -562,7 +571,7 @@ impl ImageLayerInner { } let buf = BytesMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf).await; + let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { @@ -628,6 +637,7 @@ impl ImageLayerWriterInner { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -635,7 +645,7 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, @@ -647,6 +657,7 @@ impl ImageLayerWriterInner { virtual_file::OpenOptions::new() .write(true) .create_new(true), + ctx, ) .await? 
}; @@ -801,10 +812,11 @@ impl ImageLayerWriter { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) .await?, ), }) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 4dacbec2f3..9553f83026 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -473,10 +473,11 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { @@ -642,6 +643,7 @@ impl InMemoryLayer { self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b5e69db7f4..8c64621710 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; -use utils::sync::heavier_once_cell; +use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ - AsLayerDesc, 
LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, + AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; @@ -128,19 +128,17 @@ pub(crate) fn local_layer_path( conf: &PageServerConf, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - layer_file_name: &LayerFileName, - _generation: &Generation, + layer_file_name: &LayerName, + generation: &Generation, ) -> Utf8PathBuf { let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); - timeline_path.join(layer_file_name.file_name()) - - // TOOD: include generation in the name in now+1 releases. - // timeline_path.join(format!( - // "{}{}", - // layer_file_name.file_name(), - // generation.get_suffix() - // )) + if generation.is_none() { + // Without a generation, we may only use legacy path style + timeline_path.join(layer_file_name.to_string()) + } else { + timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + } } impl Layer { @@ -148,7 +146,7 @@ impl Layer { pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { let local_path = local_layer_path( @@ -189,7 +187,7 @@ impl Layer { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( @@ -261,7 +259,7 @@ impl Layer { conf, &timeline.tenant_shard_id, &timeline.timeline_id, - &desc.filename(), + &desc.layer_name(), &timeline.generation, ); @@ -587,9 +585,6 @@ struct LayerInner { /// [`Timeline::gate`] at the same time. timeline: Weak, - /// Cached knowledge of [`Timeline::remote_client`] being `Some`. 
- have_remote_client: bool, - access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. @@ -689,7 +684,7 @@ impl Drop for LayerInner { let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); @@ -734,23 +729,23 @@ impl Drop for LayerInner { if removed { timeline.metrics.resident_physical_size_sub(file_size); } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + let res = timeline + .remote_client + .schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. 
+ if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + } else { + LAYER_IMPL_METRICS.inc_completed_deletes(); } }); } @@ -782,11 +777,12 @@ impl LayerInner { LayerInner { conf, - debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, path: local_path, desc, timeline: Arc::downgrade(timeline), - have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), inner, @@ -815,8 +811,6 @@ impl LayerInner { /// in a new attempt to evict OR join the previously started attempt. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { - assert!(self.have_remote_client); - let mut rx = self.status.as_ref().unwrap().subscribe(); { @@ -973,10 +967,6 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if timeline.remote_client.as_ref().is_none() { - return Err(DownloadError::NoRemoteStorage); - } - if let Some(ctx) = ctx { self.check_expected_download(ctx)?; } @@ -1113,15 +1103,12 @@ impl LayerInner { permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> anyhow::Result> { - let client = timeline + let result = timeline .remote_client - .as_ref() - .expect("checked before download_init_and_wait"); - - let result = client .download_layer_file( - &self.desc.filename(), + &self.desc.layer_name(), &self.metadata(), + &self.path, &timeline.cancel, ctx, ) @@ -1257,7 +1244,7 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = 
self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); let resident = self .inner @@ -1271,18 +1258,19 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: !resident, access_stats, + l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), } } else { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, @@ -1293,20 +1281,10 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, only_version: usize) { - let can_evict = self.have_remote_client; - // we cannot know without inspecting LayerInner::inner if we should evict or not, even // though here it is very likely let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - if !can_evict { - // it would be nice to assert this case out, but we are in drop - span.in_scope(|| { - tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); - }); - return; - } - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might // drop while the `self.inner` is being locked, leading to a deadlock. 
@@ -1355,7 +1333,7 @@ impl LayerInner { is_good_to_continue(&rx.borrow_and_update())?; - let Ok(_gate) = timeline.gate.enter() else { + let Ok(gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; @@ -1443,7 +1421,7 @@ impl LayerInner { Self::spawn_blocking(move || { let _span = span.entered(); - let res = self.evict_blocking(&timeline, &permit); + let res = self.evict_blocking(&timeline, &gate, &permit); let waiters = self.inner.initializer_count(); @@ -1469,6 +1447,7 @@ impl LayerInner { fn evict_blocking( &self, timeline: &Timeline, + _gate: &gate::GateGuard, _permit: &heavier_once_cell::InitPermit, ) -> Result<(), EvictionCancelled> { // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` @@ -1578,8 +1557,6 @@ pub(crate) enum EvictionError { pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, - #[error("no remote storage configured")] - NoRemoteStorage, #[error("context denies downloading")] ContextAndConfigReallyDeniesDownloads, #[error("downloading is really required but not allowed by this method")] diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 52f62faa8d..fa9142d5e9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -145,7 +145,7 @@ async fn smoke_test() { .await .expect("the local layer file still exists"); - let rtc = timeline.remote_client.as_ref().unwrap(); + let rtc = &timeline.remote_client; { let layers = &[layer]; @@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() { timeline.freeze_and_flush().await.unwrap(); // wait for the upload to complete so our Arc::strong_count assertion holds - timeline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + timeline.remote_client.wait_completion().await.unwrap(); let (evicted_layer, not_evicted) = { let mut layers = { 
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index c375923e81..a89b66e4a1 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -51,7 +51,7 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get an image layer name for this layer.
/// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 72% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index fff66a9d07..da26e1eeb7 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -15,29 +15,29 @@ use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -57,16 +57,14 @@ impl Ord for 
DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -105,14 +103,14 @@ impl DeltaFileName { // or panic? } - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -126,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -164,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all 
over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -172,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as then LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -202,14 +198,14 @@ impl ImageFileName { let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -220,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression). 
The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -243,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -251,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -260,25 +259,25 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; /// Conversion from 
either a physical layer filename, or the string-ization of /// Self. When loading a physical layer filename, we drop any extra information /// not needed to build Self. fn from_str(value: &str) -> Result { - let gen_suffix_regex = Regex::new("^(?.+)-(?[0-9a-f]{8})$").unwrap(); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); let file_name: Cow = match gen_suffix_regex.captures(value) { Some(captures) => captures .name("base") @@ -288,8 +287,8 @@ impl FromStr for LayerFileName { None => value.into(), }; - let delta = DeltaFileName::parse_str(&file_name); - let image = ImageFileName::parse_str(&file_name); + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -304,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -316,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -348,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { mod test { use super::*; #[test] - fn image_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Image(ImageFileName { + fn image_layer_parse() { + let expected = LayerName::Image(ImageLayerName { 
key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); assert_eq!(parsed, expected,); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); assert_eq!(parsed, expected,); - - Ok(()) } #[test] - fn delta_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Delta(DeltaFileName { + fn delta_layer_parse() { + let expected = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn_range: Lsn::from_hex("00000000014FED58").unwrap() ..Lsn::from_hex("000000000154C481").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); assert_eq!(parsed, expected); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + let parsed = 
LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); assert_eq!(parsed, expected); - - Ok(()) } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f153719f98..ba2b8afd03 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - let s: &'static str = self.into(); - s + self.into() } } +static PERMIT_GAUGES: once_cell::sync::Lazy< + enum_map::EnumMap, +> = once_cell::sync::Lazy::new(|| { + enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) + })) +}); + /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE - .with_label_values(&[loop_kind.as_static_str()]) - .guard(); + let _guard = PERMIT_GAUGES[loop_kind].guard(); pausable_failpoint!( "initial-size-calculation-permit-pause", diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6d012c70c..881e7f8f3c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,14 +18,14 @@ use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - keyspace::{KeySpaceAccum, SparseKeyPartitioning}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AuxFilePolicy, CompactionAlgorithm, 
DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - TimelineState, + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -60,10 +60,12 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::{ - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, +use crate::{ + aux_file::AuxFileSizeEstimator, + tenant::{ + layer_map::{LayerMap, SearchResult}, + metadata::TimelineMetadata, + }, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -75,7 +77,7 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }, }; @@ -86,6 +88,9 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, +}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -197,7 +202,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { - pub remote_client: Option, + pub remote_client: RemoteTimelineClient, pub deletion_queue_client: DeletionQueueClient, pub timeline_get_throttle: Arc< 
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, @@ -264,12 +269,14 @@ pub struct Timeline { // Atomic would be more appropriate here. last_freeze_ts: RwLock, + pub(crate) standby_horizon: AtomicLsn, + // WAL redo manager. `None` only for broken tenants. walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Option>, + pub remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -343,8 +350,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// When did we last calculate the partitioning? Make it pub to test cases. + pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -409,6 +416,12 @@ pub struct Timeline { /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, + + /// Size estimator for aux file v2 + pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, + + /// Indicate whether aux file v2 storage is enabled. + pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, } pub struct WalReceiverInfo { @@ -472,6 +485,11 @@ impl GcCutoffs { } } +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, +} + /// An error happened in a get() operation. 
#[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { @@ -496,6 +514,13 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + #[derive(Debug)] pub struct MissingKeyError { key: Key, @@ -773,6 +798,11 @@ pub(crate) enum ShutdownMode { Hard, } +struct ImageLayerCreationOutcome { + image: Option, + next_start_key: Key, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -874,7 +904,7 @@ impl Timeline { } let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await; if self.conf.validate_vectored_get { @@ -1019,7 +1049,12 @@ impl Timeline { } GetVectoredImpl::Vectored => { let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) .await; if self.conf.validate_vectored_get { @@ -1107,7 +1142,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), ctx, ) .await; @@ -1184,7 +1219,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, - mut reconstruct_state: ValuesReconstructState, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { let get_kind = if keyspace.total_raw_size() == 1 { @@ -1196,7 +1231,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1205,7 +1240,8 @@ impl Timeline { 
.start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in reconstruct_state.keys { + + for (key, res) in std::mem::take(&mut reconstruct_state.keys) { match res { Err(err) => { results.insert(key, Err(err)); @@ -1220,11 +1256,17 @@ impl Timeline { } reconstruct_timer.stop_and_record(); - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } Ok(results) } @@ -1364,22 +1406,14 @@ impl Timeline { /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_projected() - } else { - None - } + self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. 
pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_visible() - } else { - None - } + self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. @@ -1498,6 +1532,20 @@ impl Timeline { Ok(()) } + /// Obtains a temporary lease blocking garbage collection for the given LSN + pub(crate) fn make_lsn_lease( + &self, + _lsn: Lsn, + _ctx: &RequestContext, + ) -> anyhow::Result { + const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60); + let lease = LsnLease { + valid_until: SystemTime::now() + LEASE_LENGTH, + }; + // TODO: dummy implementation + Ok(lease) + } + /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { @@ -1652,7 +1700,7 @@ impl Timeline { return Ok(()); } - match self.get_compaction_algorithm() { + match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } @@ -1749,16 +1797,14 @@ impl Timeline { match self.freeze_and_flush().await { Ok(_) => { // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? 
- client.shutdown().await; - } + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + self.remote_client.shutdown().await; } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that @@ -1774,18 +1820,16 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.stop(); - // As documented in remote_client.stop()'s doc comment, it's our responsibility - // to shut down the upload queue tasks. - // TODO: fix that, task management should be encapsulated inside remote_client. - task_mgr::shutdown_tasks( - Some(TaskKind::RemoteUploadTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - } + self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; // TODO: work toward making this a no-op. See this funciton's doc comment for more context. 
tracing::debug!("Waiting for tasks..."); @@ -1905,16 +1949,12 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); }; - if self.remote_client.is_none() { - return Ok(Some(false)); - } - layer.download().await?; Ok(Some(true)) @@ -1925,7 +1965,7 @@ impl Timeline { /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. pub(crate) async fn evict_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let _gate = self .gate @@ -2056,12 +2096,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } - fn get_compaction_algorithm(&self) -> CompactionAlgorithm { + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { let tenant_conf = &self.tenant_conf.load(); tenant_conf .tenant_conf .compaction_algorithm - .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) + .as_ref() + .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm) + .clone() } fn get_eviction_policy(&self) -> EvictionPolicy { @@ -2138,6 +2180,7 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, + aux_file_policy: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2155,6 +2198,16 @@ impl Timeline { }; Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { 
conf, tenant_conf, @@ -2169,7 +2222,7 @@ impl Timeline { walredo_mgr, walreceiver: Mutex::new(None), - remote_client: resources.remote_client.map(Arc::new), + remote_client: Arc::new(resources.remote_client), // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -2186,14 +2239,7 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( - &tenant_shard_id, - &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), - ), + metrics, query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, @@ -2251,12 +2297,18 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + standby_horizon: AtomicLsn::new(0), + timeline_get_throttle: resources.timeline_get_throttle, aux_files: tokio::sync::Mutex::new(AuxFilesState { dir: None, n_deltas: 0, }), + + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), + + last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -2387,13 +2439,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -2421,10 +2473,6 @@ impl Timeline { discovered_layers.push((layer_file_name, local_path, file_size)); continue; } - Discovered::Metadata => { - warn!("found legacy metadata file, these should have been 
removed in load_tenant_config"); - continue; - } Discovered::IgnoredBackup => { continue; } @@ -2463,33 +2511,37 @@ impl Timeline { let mut needs_cleanup = Vec::new(); let mut total_physical_size = 0; - for (name, local_path, decision) in decided { + for (name, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.file_size() == remote.file_size() { + if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. - UseLocal(remote) + UseLocal(LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path, + }) } else { - let local_path = local_path.as_ref().expect("Locally found layer must have path"); - init::cleanup_local_file_for_remote(local_path, &local, &remote)?; + init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?; + if let Some(local) = local { + init::cleanup_future_layer( + &local.local_path, + &name, + disk_consistent_lsn, + )?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_local_only_file(&local_path, &name, &local)?; + init::cleanup_local_only_file(&name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -2503,20 +2555,10 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - - let local_path = local_path.unwrap_or_else(|| { - local_layer_path( - conf, - 
&this.tenant_shard_id, - &this.timeline_id, - &name, - &m.generation, - ) - }); - - Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard() + UseLocal(local) => { + total_physical_size += local.metadata.file_size(); + Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + .drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2536,36 +2578,36 @@ impl Timeline { guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); - if let Some(rtc) = self.remote_client.as_ref() { - rtc.schedule_layer_file_deletion(&needs_cleanup)?; - rtc.schedule_index_upload_for_file_changes()?; - // This barrier orders above DELETEs before any later operations. - // This is critical because code executing after the barrier might - // create again objects with the same key that we just scheduled for deletion. - // For example, if we just scheduled deletion of an image layer "from the future", - // later compaction might run again and re-create the same image layer. - // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. - // "same" here means same key range and LSN. - // - // Without a barrier between above DELETEs and the re-creation's PUTs, - // the upload queue may execute the PUT first, then the DELETE. - // In our example, we will end up with an IndexPart referencing a non-existent object. - // - // 1. a future image layer is created and uploaded - // 2. ps restart - // 3. the future layer from (1) is deleted during load layer map - // 4. image layer is re-created and uploaded - // 5. deletion queue would like to delete (1) but actually deletes (4) - // 6. 
delete by name works as expected, but it now deletes the wrong (later) version - // - // See https://github.com/neondatabase/neon/issues/5878 - // - // NB: generation numbers naturally protect against this because they disambiguate - // (1) and (4) - rtc.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or - // on retry. - } + self.remote_client + .schedule_layer_file_deletion(&needs_cleanup)?; + self.remote_client + .schedule_index_upload_for_file_changes()?; + // This barrier orders above DELETEs before any later operations. + // This is critical because code executing after the barrier might + // create again objects with the same key that we just scheduled for deletion. + // For example, if we just scheduled deletion of an image layer "from the future", + // later compaction might run again and re-create the same image layer. + // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. + // "same" here means same key range and LSN. + // + // Without a barrier between above DELETEs and the re-creation's PUTs, + // the upload queue may execute the PUT first, then the DELETE. + // In our example, we will end up with an IndexPart referencing a non-existent object. + // + // 1. a future image layer is created and uploaded + // 2. ps restart + // 3. the future layer from (1) is deleted during load layer map + // 4. image layer is re-created and uploaded + // 5. deletion queue would like to delete (1) but actually deletes (4) + // 6. delete by name works as expected, but it now deletes the wrong (later) version + // + // See https://github.com/neondatabase/neon/issues/5878 + // + // NB: generation numbers naturally protect against this because they disambiguate + // (1) and (4) + self.remote_client.schedule_barrier()?; + // Tenant::create_timeline will wait for these uploads to happen before returning, or + // on retry. 
info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -2624,6 +2666,7 @@ impl Timeline { // Don't make noise. } else { warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); } } }; @@ -2997,10 +3040,10 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerFileName) -> Option { + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename(); + let historic_layer_name = historic_layer.layer_name(); if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } @@ -3017,9 +3060,6 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - // no point in heatmaps without remote client - let _remote_client = self.remote_client.as_ref()?; - if !self.is_active() { return None; } @@ -3030,7 +3070,7 @@ impl Timeline { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( - layer.layer_desc().filename(), + layer.layer_desc().layer_name(), (&layer.metadata()).into(), last_activity_ts, ) @@ -3040,6 +3080,15 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + + /// Returns true if the given lsn is or was an ancestor branchpoint. 
+ pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self.remote_client.is_previous_ancestor_lsn(lsn)) + } } type TraversalId = Arc; @@ -3177,7 +3226,7 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, start_lsn); @@ -3206,7 +3255,7 @@ impl Timeline { for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); let frozen_layer = frozen_layer.clone(); @@ -3283,12 +3332,15 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let completed = Self::get_vectored_reconstruct_data_timeline( + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( timeline, keyspace.clone(), cont_lsn, @@ -3307,12 +3359,31 @@ impl Timeline { ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], }); - // Keyspace is fully retrieved, no ancestor timeline, or 
metadata scan (where we do not look - // into ancestor timelines). TODO: is there any other metadata which we want to inherit? - if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { - break; + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } + // Not fully retrieved but no ancestor timeline. + if timeline.ancestor_timeline.is_none() { + break Some(keyspace); + } + + // Now we see if there are keys covered by the image layer but does not exist in the + // image layer, which means that the key does not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. + let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline @@ -3320,14 +3391,14 @@ impl Timeline { .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; - } + }; - if keyspace.total_raw_size() != 0 { + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { - key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ shard: self .shard_identity - .get_shard_number(&keyspace.start().unwrap()), + .get_shard_number(&missing_keyspace.start().unwrap()), cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), @@ -3352,6 +3423,9 @@ impl Timeline { /// /// At each iteration pop the top of the fringe (the layer with the highest Lsn) /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. 
async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, keyspace: KeySpace, @@ -3359,20 +3433,27 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { let mut unmapped_keyspace = keyspace.clone(); let mut fringe = LayerFringe::new(); let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let keys_done_last_step = reconstruct_state.consume_done_keys(); + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not @@ -3444,13 +3525,16 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; - reconstruct_state.on_layer_visited(); + reconstruct_state.on_layer_visited(&layer_to_read); } else { break; } } - Ok(completed_keyspace) + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } /// # Cancel-safety @@ -3551,7 +3635,11 @@ impl Timeline { /// /// Get a handle to the latest layer for appending. 
/// - async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + async fn get_layer_for_write( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { let mut guard = self.layers.write().await; let layer = guard .get_layer_for_write( @@ -3560,6 +3648,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, + ctx, ) .await?; Ok(layer) @@ -3824,8 +3913,8 @@ impl Timeline { ); self.create_delta_layer( &frozen_layer, - ctx, Some(metadata_keyspace.0.ranges[0].clone()), + ctx, ) .await? } else { @@ -3854,7 +3943,7 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3953,29 +4042,23 @@ impl Timeline { x.unwrap() )); - if let Some(remote_client) = &self.remote_client { - for layer in layers_to_upload { - remote_client.schedule_layer_file_upload(layer)?; - } - remote_client.schedule_index_upload_for_metadata_update(&update)?; + for layer in layers_to_upload { + self.remote_client.schedule_layer_file_upload(layer)?; } + self.remote_client + .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { - if let Some(remote_client) = &self.remote_client { - remote_client - .preserve_initdb_archive( - &self.tenant_shard_id.tenant_id, - &self.timeline_id, - &self.cancel, - ) - .await?; - } else { - bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id); - } - Ok(()) + self.remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + 
.await } // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked @@ -3983,8 +4066,8 @@ impl Timeline { async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, - ctx: &RequestContext, key_range: Option>, + ctx: &RequestContext, ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); @@ -4007,6 +4090,7 @@ impl Timeline { &self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + &ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -4130,6 +4214,176 @@ impl Timeline { false } + /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, + /// so that at most one image layer will be produced from this function. + async fn create_image_layer_for_rel_blocks( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + start: Key, + ) -> Result { + let mut wrote_keys = false; + + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, but sharded tenants may + // need to drop keys that don't belong to them. 
If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { + key_request_accum.add_key(key); + } + + let last_key_in_range = key.next() == range.end; + key = key.next(); + + // Maybe flush `key_rest_accum` + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + || (last_key_in_range && key_request_accum.raw_size() > 0) + { + let results = self + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(CreateImageLayersError::PageReconstructError(err)); + } + } + }; + + // Write all the keys we just read into our new image layer. 
+ image_layer_writer.put_image(img_key, img, ctx).await?; + wrote_keys = true; + } + } + } + } + + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } + + /// Create an image layer for metadata keys. This function produces one image layer for all metadata + /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it + /// would not be too large to fit in a single image layer. + #[allow(clippy::too_many_arguments)] + async fn create_image_layer_for_metadata_keys( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + mode: ImageLayerCreationMode, + ) -> Result { + assert!(!matches!(mode, ImageLayerCreationMode::Initial)); + + // Metadata keys image layer creation. 
+ let mut reconstruct_state = ValuesReconstructState::default(); + let data = self + .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + let (data, total_kb_retrieved, total_key_retrieved) = { + let mut new_data = BTreeMap::new(); + let mut total_kb_retrieved = 0; + let mut total_key_retrieved = 0; + for (k, v) in data { + let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + total_kb_retrieved += KEY_SIZE + v.len(); + total_key_retrieved += 1; + new_data.insert(k, v); + } + (new_data, total_kb_retrieved / 1024, total_key_retrieved) + }; + let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); + + let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + info!( + "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ + delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ + total_key_retrieved={total_key_retrieved}" + ); + if !trigger_generation && mode == ImageLayerCreationMode::Try { + return Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: img_range.end, + }); + } + let has_keys = !data.is_empty(); + for (k, v) in data { + // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get + // considers this situation properly. + // if v.is_empty() { + // continue; + // } + + // No need to handle sharding b/c metadata keys are always on the 0-th shard. + + // TODO: split image layers to avoid too large layer files. Too large image files are not handled + // on the normal data path either. 
+ image_layer_writer.put_image(k, v, ctx).await?; + } + Ok(ImageLayerCreationOutcome { + image: if has_keys { + let image_layer = image_layer_writer.finish(self, ctx).await?; + Some(image_layer) + } else { + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + None + }, + next_start_key: img_range.end, + }) + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4171,19 +4425,17 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - - if partition.overlaps(&Key::metadata_key_range()) { - // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a - // rather big change. Keep this patch small for now. - match mode { - ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { - // skip image layer creation anyways for metadata keys. - start = img_range.end; - continue; - } - ImageLayerCreationMode::Initial => { - return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); - } + let compact_metadata = partition.overlaps(&Key::metadata_key_range()); + if compact_metadata { + for range in &partition.ranges { + assert!( + range.start.field1 >= METADATA_KEY_BEGIN_PREFIX + && range.end.field1 <= METADATA_KEY_END_PREFIX, + "metadata keys must be partitioned separately" + ); + } + if mode == ImageLayerCreationMode::Initial { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); } } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip @@ -4194,12 +4446,13 @@ impl Timeline { } } - let mut image_layer_writer = ImageLayerWriter::new( + let image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &img_range, lsn, + ctx, ) .await?; @@ -4209,87 
+4462,39 @@ impl Timeline { ))) }); - let mut wrote_keys = false; + if !compact_metadata { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + start, + ) + .await?; - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - // Decide whether to retain this key: usually we do, but sharded tenants may - // need to drop keys that don't belong to them. If we retain the key, add it - // to `key_request_accum` for later issuing a vectored get - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } else { - key_request_accum.add_key(key); - } - - let last_key_in_range = key.next() == range.end; - key = key.next(); - - // Maybe flush `key_rest_accum` - if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || last_key_in_range - { - let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) - .await?; - - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. 
- // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(CreateImageLayersError::PageReconstructError( - err, - )); - } - } - }; - - // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img, ctx).await?; - wrote_keys = true; - } - } - } - } - - if wrote_keys { - // Normal path: we have written some data into the new image layer for this - // partition, so flush it to disk. - start = img_range.end; - let image_layer = image_layer_writer.finish(self, ctx).await?; - image_layers.push(image_layer); + start = next_start_key; + image_layers.extend(image); } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + mode, + ) + .await?; + start = next_start_key; + image_layers.extend(image); } } @@ -4304,6 +4509,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -4328,6 +4534,16 @@ impl Timeline { /// this Timeline is shut down. Calling this function will cause the initial /// logical size calculation to skip waiting for the background jobs barrier. 
pub(crate) async fn await_initial_logical_size(self: Arc) { + if !self.shard_identity.is_shard_zero() { + // We don't populate logical size on shard >0: skip waiting for it. + return; + } + + if self.remote_client.is_deleting() { + // The timeline was created in a deletion-resume state, we don't expect logical size to be populated + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4339,9 +4555,10 @@ impl Timeline { // the logical size cancellation to skip the concurrency limit semaphore. // TODO: this is an unexpected case. We should restructure so that it // can't happen. - tracing::info!( + tracing::warn!( "await_initial_logical_size: can't get semaphore cancel token, skipping" ); + debug_assert!(false); } tokio::select!( @@ -4357,7 +4574,6 @@ impl Timeline { /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not /// a technical requirement - /// - has prev_lsn in remote storage (temporary restriction) /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. @@ -4392,6 +4608,14 @@ impl Timeline { ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } + + /// Switch aux file policy and schedule upload to the index part. + pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { + self.last_aux_file_policy.store(Some(policy)); + self.remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; + Ok(()) + } } /// Top-level failure to compact. 
@@ -4491,9 +4715,8 @@ impl Timeline { // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, new_deltas)?; - } + self.remote_client + .schedule_compaction_update(&remove_layers, new_deltas)?; drop_wlock(guard); @@ -4511,9 +4734,8 @@ impl Timeline { let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; - } + self.remote_client + .schedule_compaction_update(&drop_layers, &upload_layers)?; Ok(()) } @@ -4523,16 +4745,14 @@ impl Timeline { self: &Arc, new_images: impl IntoIterator, ) -> anyhow::Result<()> { - let Some(remote_client) = &self.remote_client else { - return Ok(()); - }; for layer in new_images { - remote_client.schedule_layer_file_upload(layer)?; + self.remote_client.schedule_layer_file_upload(layer)?; } // should any new image layer been created, not uploading index_part will // result in a mismatch between remote_physical_size and layermap calculated // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; + self.remote_client + .schedule_index_upload_for_file_changes()?; Ok(()) } @@ -4630,11 +4850,9 @@ impl Timeline { pub(super) async fn gc(&self) -> anyhow::Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc - let cancel = crate::task_mgr::shutdown_token(); let _g = tokio::select! 
{ guard = self.gc_lock.lock() => guard, _ = self.cancel.cancelled() => return Ok(GcResult::default()), - _ = cancel.cancelled() => return Ok(GcResult::default()), }; let timer = self.metrics.garbage_collect_histo.start_timer(); @@ -4654,7 +4872,32 @@ impl Timeline { (horizon_cutoff, pitr_cutoff, retain_lsns) }; - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let standby_horizon = self.standby_horizon.load(); + // Hold GC for the standby, but as a safety guard do it only within some + // reasonable lag. + if standby_horizon != Lsn::INVALID { + if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) { + const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB + if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG { + new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff); + trace!("holding off GC for standby apply LSN {}", standby_horizon); + } else { + warn!( + "standby is lagging for more than {}MB, not holding gc for it", + MAX_ALLOWED_STANDBY_LAG / 1024 / 1024 + ) + } + } + } + + // Reset standby horizon to ignore it if it is not updated till next GC. + // It is an easy way to unset it when standby disappears without adding + // more conf options. 
+ self.standby_horizon.store(Lsn::INVALID); + self.metrics + .standby_horizon_gauge + .set(Lsn::INVALID.0 as i64); let res = self .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) @@ -4731,7 +4974,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -4742,7 +4985,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -4761,7 +5004,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4792,7 +5035,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4800,7 +5043,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. 
debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); @@ -4820,9 +5063,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_gc_update(&gc_layers)?; - } + self.remote_client.schedule_gc_update(&gc_layers)?; guard.finish_gc_timeline(&gc_layers); @@ -5206,7 +5447,7 @@ impl<'a> TimelineWriter<'a> { let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action).await?; + let layer = self.handle_open_layer_action(lsn, action, ctx).await?; let res = layer.put_value(key, lsn, &buf, ctx).await; if res.is_ok() { @@ -5229,14 +5470,15 @@ impl<'a> TimelineWriter<'a> { &mut self, at: Lsn, action: OpenLayerAction, + ctx: &RequestContext, ) -> anyhow::Result<&Arc> { match action { OpenLayerAction::Roll => { let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); self.roll_layer(freeze_at).await?; - self.open_layer(at).await?; + self.open_layer(at, ctx).await?; } - OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::Open => self.open_layer(at, ctx).await?, OpenLayerAction::None => { assert!(self.write_guard.is_some()); } @@ -5245,8 +5487,8 @@ impl<'a> TimelineWriter<'a> { Ok(&self.write_guard.as_ref().unwrap().open_layer) } - async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at).await?; + async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at, ctx).await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5323,10 +5565,14 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + 
pub(crate) async fn delete_batch( + &mut self, + batch: &[(Range, Lsn)], + ctx: &RequestContext, + ) -> anyhow::Result<()> { if let Some((_, lsn)) = batch.first() { let action = self.get_open_layer_action(*lsn, 0); - let layer = self.handle_open_layer_action(*lsn, action).await?; + let layer = self.handle_open_layer_action(*lsn, action, ctx).await?; layer.put_tombstones(batch).await?; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e83878b8fb..2eff469591 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -116,9 +116,13 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let dense_layers = self + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); + let image_layers = self .create_image_layers( - &dense_partitioning, + &partitioning, lsn, if flags.contains(CompactFlags::ForceImageLayerCreation) { ImageLayerCreationMode::Force @@ -130,24 +134,8 @@ impl Timeline { .await .map_err(anyhow::Error::from)?; - // For now, nothing will be produced... - let sparse_layers = self - .create_image_layers( - &sparse_partitioning.clone().into_dense(), - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - assert!(sparse_layers.is_empty()); - - self.upload_new_image_layers(dense_layers)?; - dense_partitioning.parts.len() + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -295,13 +283,11 @@ impl Timeline { // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage self.rewrite_layers(replace_layers, drop_layers).await?; - if let Some(remote_client) = self.remote_client.as_ref() { - // We wait for all uploads to complete before finishing this compaction stage. This is not - // necessary for correctness, but it simplifies testing, and avoids proceeding with another - // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O - // load. - remote_client.wait_completion().await?; - } + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. + self.remote_client.wait_completion().await?; Ok(()) } @@ -501,8 +487,11 @@ impl Timeline { for &DeltaEntry { key: next_key, .. } in all_keys.iter() { if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { let key_range = prev_key..next_key; // Measuring hole by just subtraction of i128 representation of key range boundaries // has not so much sense, because largest holes will corresponds field1/field2 changes. 
@@ -700,6 +689,7 @@ impl Timeline { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); lsn_range.clone() }, + ctx, ) .await?, ); @@ -755,6 +745,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -1093,6 +1084,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), + ctx, ) .await?; @@ -1167,6 +1159,7 @@ impl TimelineAdaptor { self.timeline.tenant_shard_id, key_range, lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d8701be170..b5dfc86e77 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources}; /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. 
- Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + match timeline + .remote_client + .persist_index_part_with_deleted_flag() + .await + { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) @@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory( /// Removes remote layers and an index file after them. async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - Ok(()) + timeline + .remote_client + .delete_all() + .await + .context("delete_all") } // This function removs remaining traces of a timeline on disk. @@ -260,7 +262,7 @@ impl DeleteTimelineFlow { tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, - remote_client: Option, + remote_client: RemoteTimelineClient, deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. @@ -278,6 +280,8 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, + // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace + None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 5c2b25da56..e6ddabe5b5 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,7 +12,7 @@ use crate::{ }; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -22,8 +22,6 @@ pub(crate) enum Error { TooManyAncestors, #[error("shutting down, please retry later")] ShuttingDown, - #[error("detached timeline must receive writes before the operation")] - DetachedTimelineNeedsWrites, #[error("flushing failed")] FlushAncestor(#[source] anyhow::Error), #[error("layer download failed")] @@ -43,6 +41,27 @@ pub(crate) enum Error { Unexpected(#[source] anyhow::Error), } +impl From for ApiError { + fn from(value: Error) -> Self { + match value { + e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), + // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? 
+ e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::ShuttingDown => ApiError::ShuttingDown, + Error::OtherTimelineDetachOngoing(_) => { + ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + } + // All of these contain shutdown errors, in fact, it's the most common + e @ Error::FlushAncestor(_) + | e @ Error::RewrittenDeltaDownloadFailed(_) + | e @ Error::CopyDeltaPrefix(_) + | e @ Error::UploadRewritten(_) + | e @ Error::CopyFailed(_) + | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + } + } +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -58,7 +77,7 @@ impl Default for Options { fn default() -> Self { Self { rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), - copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(), } } } @@ -72,15 +91,16 @@ pub(super) async fn prepare( ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { use Error::*; - if detached.remote_client.as_ref().is_none() { - unimplemented!("no new code for running without remote storage"); - } - let Some((ancestor, ancestor_lsn)) = detached .ancestor_timeline .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + // TODO: check if we have already been detached; for this we need to read the stored data + // on remote client, for that we need a follow-up which makes uploads cheaper and maintains + // a projection of the commited data. 
+ // + // the error is wrong per openapi return Err(NoAncestor); }; @@ -90,18 +110,10 @@ pub(super) async fn prepare( if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose - // not to + // not to, at least initially return Err(TooManyAncestors); } - if detached.get_prev_record_lsn() == Lsn::INVALID - || detached.disk_consistent_lsn.load() == ancestor_lsn - { - // this is to avoid a problem that after detaching we would be unable to start up the - // compute because of "PREV_LSN: invalid". - return Err(DetachedTimelineNeedsWrites); - } - // before we acquire the gate, we must mark the ancestor as having a detach operation // ongoing which will block other concurrent detach operations so we don't get to ackward // situations where there would be two branches trying to reparent earlier branches. @@ -225,6 +237,7 @@ pub(super) async fn prepare( &detached .conf .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -324,8 +337,6 @@ async fn upload_rewritten_layer( // FIXME: better shuttingdown error target .remote_client - .as_ref() - .unwrap() .upload_layer_file(&copied, cancel) .await .map_err(UploadRewritten)?; @@ -349,6 +360,7 @@ async fn copy_lsn_prefix( target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, + ctx, ) .await .map_err(CopyDeltaPrefix)?; @@ -407,15 +419,13 @@ async fn remote_copy( let owned = crate::tenant::storage_layer::Layer::for_evicted( adoptee.conf, adoptee, - adopted.layer_desc().filename(), + adopted.layer_desc().layer_name(), metadata, ); // FIXME: better shuttingdown error adoptee .remote_client - .as_ref() - .unwrap() .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) @@ -429,11 +439,6 @@ pub(super) async fn complete( prepared: PreparedTimelineDetach, _ctx: &RequestContext, ) -> Result, anyhow::Error> 
{ - let rtc = detached - .remote_client - .as_ref() - .expect("has to have a remote timeline client for timeline ancestor detach"); - let PreparedTimelineDetach { layers } = prepared; let ancestor = detached @@ -450,11 +455,13 @@ pub(super) async fn complete( // // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart // which could give us a completely wrong layer combination. - rtc.schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; let mut tasks = tokio::task::JoinSet::new(); @@ -499,8 +506,6 @@ pub(super) async fn complete( async move { let res = timeline .remote_client - .as_ref() - .expect("reparented has to have remote client because detached has one") .schedule_reparenting_and_wait(&new_parent) .await; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3567761b9a..8a8c38d0ce 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -23,7 +23,7 @@ use std::{ use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -211,11 +211,6 @@ impl Timeline { // So, we just need to deal with this. 
- if self.remote_client.is_none() { - error!("no remote storage configured, cannot evict layers"); - return ControlFlow::Continue(()); - } - let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 9c33981807..feadc79e5e 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,10 +6,9 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, + storage_layer::LayerName, Generation, }, - METADATA_FILE_NAME, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; @@ -20,15 +19,13 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, Utf8PathBuf, u64), + Layer(LayerName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed Temporary(String), /// Temporary on-demand download files, should be removed TemporaryDownload(String), - /// "metadata" file we persist locally and include in `index_part.json` - Metadata, /// Backup file from previously future layers IgnoredBackup, /// Unrecognized, warn about these @@ -43,15 +40,13 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); Discovered::Layer(file_name, direntry.path().to_owned(), file_size) } Err(_) => { - if file_name == METADATA_FILE_NAME { - Discovered::Metadata - } else if file_name.ends_with(".old") { + if file_name.ends_with(".old") { // ignore these Discovered::IgnoredBackup } else if remote_timeline_client::is_temp_download_file(direntry.path()) { @@ -72,6 +67,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + metadata: LayerFileMetadata::new(file_size, 
generation, shard), + } + } +} + /// Decision on what to do with a layer file after considering its local and remote metadata. #[derive(Clone, Debug)] pub(super) enum Decision { @@ -80,11 +97,11 @@ pub(super) enum Decision { /// The layer is present locally, but local metadata does not match remote; we must /// delete it and treat it as evicted. UseRemote { - local: LayerFileMetadata, + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), + UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,39 +109,29 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. - LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. 
pub(super) fn reconcile( - discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>, + discovered: Vec<(LayerName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<( - LayerFileName, - Option, - Result, -)> { +) -> Vec<(LayerName, Result)> { use Decision::*; - // name => (local_path, local_metadata, remote_metadata) - type Collected = HashMap< - LayerFileName, - ( - Option, - Option, - Option, - ), - >; + // name => (local_metadata, remote_metadata) + type Collected = + HashMap, Option)>; let mut discovered = discovered .into_iter() @@ -135,8 +142,9 @@ pub(super) fn reconcile( // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. ( - Some(local_path), - Some(LayerFileMetadata::new(file_size, generation, shard)), + Some(LocalLayerFileMetadata::new( + local_path, file_size, generation, shard, + )), None, ), ) @@ -152,20 +160,20 @@ pub(super) fn reconcile( .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { - existing.2 = Some(metadata); + existing.1 = Some(metadata); } else { - discovered.insert(name.to_owned(), (None, None, Some(metadata))); + discovered.insert(name.to_owned(), (None, Some(metadata))); } }); discovered .into_iter() - .map(|(name, (local_path, local, remote))| { + .map(|(name, (local, remote))| { let decision = if name.is_in_future(disk_consistent_lsn) { Err(DismissedLayer::Future { local }) } else { match (local, remote) { - (Some(local), Some(remote)) if local != remote => { + (Some(local), Some(remote)) if local.metadata != remote => { Ok(UseRemote { local, remote }) } (Some(x), Some(_)) => Ok(UseLocal(x)), @@ -177,7 +185,7 @@ pub(super) fn reconcile( } }; - (name, local_path, decision) + (name, decision) }) .collect::>() } @@ -189,12 +197,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> 
anyhow::Result<()> { } pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, + local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.file_size(); + let local_size = local.metadata.file_size(); let remote_size = remote.file_size(); + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); @@ -211,7 +219,7 @@ pub(super) fn cleanup_local_file_for_remote( pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -223,12 +231,14 @@ pub(super) fn cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name}, metadata {:?}", + local.metadata + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 8e8d64e0c6..248420e632 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -9,6 +9,7 @@ use utils::{ use crate::{ config::PageServerConf, + context::RequestContext, metrics::TimelineMetrics, tenant::{ layer_map::{BatchedUpdates, LayerMap}, @@ -69,6 +70,7 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + ctx: &RequestContext, ) -> Result> { 
ensure!(lsn.is_aligned()); @@ -105,7 +107,7 @@ impl LayerManager { ); let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -294,7 +296,7 @@ impl LayerFileManager { // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 991e4ac045..1d2ffec08f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -705,6 +705,7 @@ impl ConnectionManagerState { commit_lsn: info.commit_lsn, safekeeper_connstr: info.safekeeper_connstr, availability_zone: info.availability_zone, + standby_horizon: info.standby_horizon, } } MessageType::SafekeeperDiscoveryResponse => { @@ -725,6 +726,21 @@ impl ConnectionManagerState { WALRECEIVER_BROKER_UPDATES.inc(); + trace!( + "safekeeper info update: standby_horizon(cutoff)={}", + timeline_update.standby_horizon + ); + if timeline_update.standby_horizon != 0 { + // ignore reports from safekeepers not connected to replicas + self.timeline + .standby_horizon + .store(Lsn(timeline_update.standby_horizon)); + self.timeline + .metrics + .standby_horizon_gauge + .set(timeline_update.standby_horizon as i64); + } + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -1094,6 +1110,7 @@ mod tests { commit_lsn, safekeeper_connstr: safekeeper_connstr.to_owned(), 
availability_zone: None, + standby_horizon: 0, }, latest_update, } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0bf4d1e599..c0cc8f3124 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,12 +1,14 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use std::sync::Arc; use tracing::info; use utils::lsn::AtomicLsn; @@ -45,7 +47,7 @@ pub(crate) struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - pub(crate) latest_files: HashMap, + pub(crate) latest_files: HashMap, /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? @@ -56,6 +58,12 @@ pub(crate) struct UploadQueueInitialized { /// DANGER: do not return to outside world, e.g., safekeepers. pub(crate) latest_metadata: TimelineMetadata, + /// Part of the flattened "next" `index_part.json`. + pub(crate) latest_lineage: Lineage, + + /// The last aux file policy used on this timeline. + pub(crate) last_aux_file_policy: Option, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. 
@@ -89,7 +97,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -171,6 +179,7 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), + latest_lineage: Lineage::default(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations @@ -184,6 +193,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: Default::default(), }; *self = UploadQueue::Initialized(state); @@ -218,6 +228,7 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), + latest_lineage: index_part.lineage.clone(), projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), @@ -233,6 +244,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: index_part.last_aux_file_policy(), }; *self = UploadQueue::Initialized(state); @@ -281,7 +293,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. 
#[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] @@ -290,7 +302,7 @@ pub(crate) enum UploadOp { UploadLayer(ResidentLayer, LayerFileMetadata), /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + UploadMetadata(Box, Lsn), /// Delete layer files Delete(Delete), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 91934d5e0e..6e825760e3 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -23,6 +23,7 @@ use pageserver_api::key::Key; use utils::lsn::Lsn; use utils::vec_map::VecMap; +use crate::context::RequestContext; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> { &self, read: &VectoredRead, buf: BytesMut, + ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); assert!( @@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size()) + .read_exact_at_n(buf, read.start, read.size(), ctx) .await?; let blobs_at = read.blobs_at.as_slice(); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a17488a286..b68f3a0e89 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,16 +344,23 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open(path: &Utf8Path) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true)).await + pub async fn open( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options(path, OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. 
- pub async fn create(path: &Utf8Path) -> Result { + pub async fn create( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { Self::open_with_options( path, OpenOptions::new().write(true).create(true).truncate(true), + ctx, ) .await } @@ -366,6 +373,7 @@ impl VirtualFile { pub async fn open_with_options( path: &Utf8Path, open_options: &OpenOptions, + _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { let path_str = path.to_string(); let parts = path_str.split('/').collect::>(); @@ -576,21 +584,34 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at(&self, buf: B, offset: u64) -> Result + pub async fn read_exact_at( + &self, + buf: B, + offset: u64, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { - let (buf, res) = - read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; res.map(|()| buf) } - pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + pub async fn read_exact_at_n( + &self, + buf: B, + offset: u64, + count: usize, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset) + self.read_at(buf, offset, ctx) }) .await; res.map(|()| buf) @@ -601,12 +622,13 @@ impl VirtualFile { &self, page: PageWriteGuard<'static>, offset: u64, + ctx: &RequestContext, ) -> Result, Error> { let buf = PageWriteGuardBuf { page, init_up_to: 0, }; - let res = self.read_exact_at(buf, offset).await; + let res = self.read_exact_at(buf, offset, ctx).await; res.map(|PageWriteGuardBuf { page, .. 
}| page) .map_err(|e| Error::new(ErrorKind::Other, e)) } @@ -699,7 +721,12 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) + pub(crate) async fn read_at( + &self, + buf: B, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (B, Result) where B: tokio_epoll_uring::BoundedBufMut + Send, { @@ -1020,20 +1047,21 @@ impl VirtualFile { pub(crate) async fn read_blk( &self, blknum: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; let buf = vec![0; PAGE_SZ]; let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64)) + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } - async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { let res; - (tmp, res) = self.read_at(tmp, self.pos).await; + (tmp, res) = self.read_at(tmp, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { @@ -1159,7 +1187,6 @@ mod tests { use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; - use std::future::Future; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; @@ -1176,9 +1203,14 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, mut buf: Vec, offset: u64) -> Result, Error> { + async fn read_exact_at( + &self, + mut buf: Vec, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } @@ -1230,13 +1262,13 @@ mod tests { // Helper function to slurp contents of a file, starting 
at the current position, // into a string - async fn read_string(&mut self) -> Result { + async fn read_string(&mut self, ctx: &RequestContext) -> Result { use std::io::Read; let mut buf = String::new(); match self { MaybeVirtualFile::VirtualFile(file) => { let mut buf = Vec::new(); - file.read_to_end(&mut buf).await?; + file.read_to_end(&mut buf, ctx).await?; return Ok(String::from_utf8(buf).unwrap()); } MaybeVirtualFile::File(file) => { @@ -1247,9 +1279,14 @@ mod tests { } // Helper function to slurp a portion of a file into a string - async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { + async fn read_string_at( + &mut self, + pos: u64, + len: usize, + ctx: &RequestContext, + ) -> Result { let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos).await?; + let buf = self.read_exact_at(buf, pos, ctx).await?; Ok(String::from_utf8(buf).unwrap()) } } @@ -1263,73 +1300,101 @@ mod tests { // results with VirtualFiles as with native Files. (Except that with // native files, you will run out of file descriptors if the ulimit // is low enough.) 
- test_files("virtual_files", |path, open_options| async move { - let vf = VirtualFile::open_with_options(&path, &open_options).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - }) - .await + struct A; + + impl Adapter for A { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result { + let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; + Ok(MaybeVirtualFile::VirtualFile(vf)) + } + } + test_files::("virtual_files").await } #[tokio::test] async fn test_physical_files() -> anyhow::Result<()> { - test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File({ - let owned_fd = open_options.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - }) - .await + struct B; + + impl Adapter for B { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + _ctx: &RequestContext, + ) -> Result { + Ok(MaybeVirtualFile::File({ + let owned_fd = opts.open(path.as_std_path()).await?; + File::from(owned_fd) + })) + } + } + + test_files::("physical_files").await } - async fn test_files(testname: &str, openfunc: OF) -> anyhow::Result<()> + /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition + /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function + /// in trait which benefits from the new lifetime capture rules already. 
+ trait Adapter { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result; + } + + async fn test_files(testname: &str) -> anyhow::Result<()> where - OF: Fn(Utf8PathBuf, OpenOptions) -> FT, - FT: Future>, + A: Adapter, { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); - let mut file_a = openfunc( + let mut file_a = A::open( path_a.clone(), OpenOptions::new() .write(true) .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode - let _ = file_a.read_string().await.unwrap_err(); + let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read - assert_eq!("foobar", file_a.read_string().await?); + assert_eq!("foobar", file_a.read_string(&ctx).await?); // It's positioned at the EOF now. - assert_eq!("", file_a.read_string().await?); + assert_eq!("", file_a.read_string(&ctx).await?); // Test seeks. 
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); - assert_eq!("ar", file_a.read_string().await?); + assert_eq!("ar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); - assert_eq!("bar", file_a.read_string().await?); + assert_eq!("bar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Test erroneous seeks to before byte 0 file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); @@ -1337,11 +1402,11 @@ mod tests { file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = openfunc( + let mut file_b = A::open( path_b.clone(), OpenOptions::new() .read(true) @@ -1349,12 +1414,13 @@ mod tests { .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; - assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) 
@@ -1364,9 +1430,13 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = - openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; - assert_eq!("FOOBAR", vfile.read_string().await?); + let mut vfile = A::open( + path_b.clone(), + OpenOptions::new().read(true).to_owned(), + &ctx, + ) + .await?; + assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -1375,13 +1445,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Check that all the other FDs still work too. Use them in random order for // good measure. vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); } Ok(()) @@ -1397,6 +1467,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -1410,8 +1481,12 @@ mod tests { // Open the file many times. 
let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) - .await?; + let f = VirtualFile::open_with_options( + &test_file_path, + OpenOptions::new().read(true), + &ctx, + ) + .await?; files.push(f); } let files = Arc::new(files); @@ -1425,12 +1500,13 @@ mod tests { let mut hdls = Vec::new(); for _threadno in 0..THREADS { let files = files.clone(); + let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); assert!(buf == SAMPLE); } }); @@ -1446,6 +1522,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1455,8 +1532,8 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); @@ -1464,8 +1541,8 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); 
assert_eq!(post, "bar"); assert!(!tmp_path.exists()); drop(file); @@ -1473,6 +1550,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1487,8 +1565,8 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9776d4ce88..3decea0c6d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -153,10 +153,7 @@ impl PostgresRedoManager { process: self .redo_process .get() - .map(|p| WalRedoManagerProcessStatus { - pid: p.id(), - kind: std::borrow::Cow::Borrowed(p.kind().into()), - }), + .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), } } } diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index ad6b4e5fe9..02c9c04bf1 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,7 +1,10 @@ +/// Layer of indirection previously used to support multiple implementations. 
+/// Subject to removal: use std::time::Duration; use bytes::Bytes; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use tracing::warn; use utils::lsn::Lsn; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; @@ -12,7 +15,6 @@ mod protocol; mod process_impl { pub(super) mod process_async; - pub(super) mod process_std; } #[derive( @@ -34,10 +36,7 @@ pub enum Kind { Async, } -pub(crate) enum Process { - Sync(process_impl::process_std::WalRedoProcess), - Async(process_impl::process_async::WalRedoProcess), -} +pub(crate) struct Process(process_impl::process_async::WalRedoProcess); impl Process { #[inline(always)] @@ -46,18 +45,17 @@ impl Process { tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - Ok(match conf.walredo_process_kind { - Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - }) + if conf.walredo_process_kind != Kind::Async { + warn!( + configured = %conf.walredo_process_kind, + "the walredo_process_kind setting has been turned into a no-op, using async implementation" + ); + } + Ok(Self(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?)) } #[inline(always)] @@ -69,29 +67,12 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - match self { - Process::Sync(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - Process::Async(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - } + self.0 + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } pub(crate) fn id(&self) -> u32 { - match self { - Process::Sync(p) => p.id(), - Process::Async(p) => p.id(), - } - } - - pub(crate) fn kind(&self) -> Kind { - match self { 
- Process::Sync(_) => Kind::Sync, - Process::Async(_) => Kind::Async, - } + self.0.id() } } diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs deleted file mode 100644 index e7a6c263c9..0000000000 --- a/pageserver/src/walredo/process/process_impl/process_std.rs +++ /dev/null @@ -1,405 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. 
- // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. 
- .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. 
- anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. 
- let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - 
.take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few 
neon-specific hints to make it work. +--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7709ab9d42..8951e6607b 100644 
--- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -237,18 +237,50 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +/* + * LSN values associated with each request to the pageserver + */ +typedef struct +{ + /* + * 'request_lsn' is the main value that determines which page version to + * fetch. + */ + XLogRecPtr request_lsn; + + /* + * A hint to the pageserver that the requested page hasn't been modified + * between this LSN and 'request_lsn'. That allows the pageserver to + * return the page faster, without waiting for 'request_lsn' to arrive in + * the pageserver, as long as 'not_modified_since' has arrived. + */ + XLogRecPtr not_modified_since; + + /* + * 'effective_request_lsn' is not included in the request that's sent to + * the pageserver, but is used to keep track of the latest LSN of when the + * request was made. In a standby server, this is always the same as the + * 'request_lsn', but in the primary we use UINT64_MAX as the + * 'request_lsn' to request the latest page version, so we need this + * separate field to remember what the latest LSN was when the request was + * made. It's needed to manage prefetch requests, to verify if the response + * to a prefetched request is still valid.
+ */ + XLogRecPtr effective_request_lsn; +} neon_request_lsns; + #if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 44ecdbd9aa..41546eae85 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -168,8 +169,7 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -271,16 +271,15 @@ static PrefetchState *MyPState; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool 
prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); -static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); +static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot); static bool @@ -338,8 +337,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->request_lsn = source_slot->request_lsn; - target_slot->not_modified_since = source_slot->not_modified_since; + target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,8 +356,9 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->request_lsn = InvalidXLogRecPtr; - source_slot->not_modified_since = InvalidXLogRecPtr; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr + }; /* update bookkeeping */ n_moved++; @@ -689,7 +688,7 @@ prefetch_set_unused(uint64 ring_index) * prefetch_wait_for(). 
*/ static void -prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; NeonGetPageRequest request = { @@ -700,23 +699,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe .blkno = slot->buftag.blockNum, }; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - - if (force_request_lsn) - { - request.req.lsn = *force_request_lsn; - request.req.not_modified_since = *force_not_modified_since; - } + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; else - { - neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum, - &request.req.lsn, - &request.req.not_modified_since); - } - slot->request_lsn = request.req.lsn; - slot->not_modified_since = request.req.not_modified_since; + slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum); + request.req.lsn = slot->request_lsns.request_lsn; + request.req.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -742,25 +732,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe * * Register that we may want the contents of BufferTag in the near future. * - * If force_request_lsn and force_not_modified_since are not NULL, those - * values are sent to the pageserver. If they are NULL, we utilize the - * lastWrittenLsn -infrastructure to fill them in. + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
*/ static uint64 -prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, - XLogRecPtr *force_not_modified_since) +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -781,10 +768,9 @@ Retry: * If the caller specified a request LSN to use, only accept prefetch * responses that satisfy that request. */ - if (force_request_lsn) + if (force_request_lsns) { - if (!neon_prefetch_response_usable(*force_request_lsn, - *force_not_modified_since, slot)) + if (!neon_prefetch_response_usable(*force_request_lsns, slot)) { /* Wait for the old request to finish and discard it */ if (!prefetch_wait_for(ring_index)) @@ -886,7 +872,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_request_lsn, force_not_modified_since); + prefetch_do_request(slot, force_request_lsns); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -1363,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +/* + * A page is being evicted from the shared buffer cache. Update the + * last-written LSN of the page, and WAL-log it if needed. + */ static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1371,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); - - if (ShutdownRequestPending) - return; - /* Don't log any pages if we're not allowed to do so. 
*/ - if (!XLogInsertAllowed()) - return; + bool log_page; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1385,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) + log_page = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_page = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_page = true; + } + + if (log_page) { - /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, @@ -1400,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (lsn == InvalidXLogRecPtr) + + if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an @@ -1436,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - ereport(PANIC, + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? 
LOG : PANIC, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, - (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); @@ -1529,11 +1539,11 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static void -neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) +static neon_request_lsns +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr last_written_lsn; + neon_request_lsns result; last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); last_written_lsn = nm_adjust_lsn(last_written_lsn); @@ -1541,13 +1551,98 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, if (RecoveryInProgress()) { - /* Request the page at the last replayed LSN. */ - *request_lsn = GetXLogReplayRecPtr(NULL); - *not_modified_since = last_written_lsn; - Assert(last_written_lsn <= *request_lsn); + /*--- + * In broad strokes, a replica always requests the page at the current + * replay LSN. But looking closer, what exactly is the replay LSN? Is + * it the last replayed record, or the record being replayed? And does + * the startup process performing the replay need to do something + * differently than backends running queries? Let's take a closer look + * at the different scenarios: + * + * 1. Startup process reads a page, last_written_lsn is old. + * + * Read the old version of the page. We will apply the WAL record on + * it to bring it up-to-date. 
+ * + * We could read the new version, with the changes from this WAL + * record already applied, to offload the work of replaying the record + * to the pageserver. The pageserver might not have received the WAL + * record yet, though, so a read of the old page version and applying + * the record ourselves is likely faster. Also, the redo function + * might be surprised if the changes have already applied. That's + * normal during crash recovery, but not in hot standby. + * + * 2. Startup process reads a page, last_written_lsn == record we're + * replaying. + * + * Can this happen? There are a few theoretical cases when it might: + * + * A) The redo function reads the same page twice. We had already read + * and applied the changes once, and now we're reading it for the + * second time. That would be a rather silly thing for a redo + * function to do, and I'm not aware of any that would do it. + * + * B) The redo function modifies multiple pages, and it already + * applied the changes to one of the pages, released the lock on + * it, and is now reading a second page. Furthermore, the first + * page was already evicted from the buffer cache, and also from + * the last-written LSN cache, so that the per-relation or global + * last-written LSN was already updated. All the WAL redo functions + * hold the locks on pages that they modify, until all the changes + * have been modified (?), which would make that impossible. + * However, we skip the locking, if the page isn't currently in the + * page cache (see neon_redo_read_buffer_filter below). + * + * Even if the one of the above cases were possible in theory, they + * would also require the pages being modified by the redo function to + * be immediately evicted from the page cache. + * + * So this probably does not happen in practice. But if it does, we + * request the new version, including the changes from the record + * being replayed. That seems like the correct behavior in any case. + * + * 3. 
Backend process reads a page with old last-written LSN + * + * Nothing special here. Read the old version. + * + * 4. Backend process reads a page with last_written_lsn == record being replayed + * + * This can happen, if the redo function has started to run, and saw + * that the page isn't present in the page cache (see + * neon_redo_read_buffer_filter below). Normally, in a normal + * Postgres server, the redo function would hold a lock on the page, + * so we would get blocked waiting the redo function to release the + * lock. To emulate that, wait for the WAL replay of the record to + * finish. + */ + /* Request the page at the end of the last fully replayed LSN. */ + XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ +#if PG_VERSION_NUM >= 150000 + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); +#endif + + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result.request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result.request_lsn = replay_lsn; + } + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = result.request_lsn; + Assert(last_written_lsn <= result.request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); } else { @@ -1559,7 +1654,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * must still in the buffer cache, so our request cannot concern * those. 
*/ - neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", LSN_FORMAT_ARGS(last_written_lsn)); /* @@ -1585,16 +1680,33 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, } /* - * Request the latest version of the page. The most up-to-date request - * LSN we could use would be the current insert LSN, but to avoid the - * overhead of looking it up, use 'flushlsn' instead. This relies on - * the assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we wouldn't be - * requesting it. + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. 
*/ - *request_lsn = flushlsn; - *not_modified_since = last_written_lsn; + result.request_lsn = UINT64_MAX; + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = flushlsn; } + + return result; } /* @@ -1604,12 +1716,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * satisfy a page read now. */ static bool -neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsn >= not_modified_since); - Assert(slot->request_lsn >= slot->not_modified_since); + Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); Assert(slot->status != PRFS_UNUSED); /* @@ -1627,26 +1743,40 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) 
*/ - if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns.not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } /*--- - * Each request to the pageserver carries two LSN values: - * `not_modified_since` and `request_lsn`. The (not_modified_since, - * request_lsn] range of each request is effectively a claim that the page - * has not been modified between those LSNs. If the range of the old - * request in the queue overlaps with the new request, we know that the - * page hasn't been modified in the union of the ranges. We can use the - * response to old request to satisfy the new request in that case. For - * example: + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the last flush WAL position when the request + * was sent to the pageserver. 
That's logically the LSN that we are + * requesting the page at, but we send UINT64_MAX to the pageserver so + * that if the GC horizon advances past that position, we still get a + * valid response instead of an error. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: * * 100 500 * Old request: +--------+ @@ -1675,9 +1805,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si */ /* this follows from the checks above */ - Assert(request_lsn >= slot->not_modified_since); + Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - return not_modified_since <= slot->request_lsn; + return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1689,8 +1819,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -1745,15 +1874,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + 
.req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum}; + .forknum = forkNum + }; resp = page_server_request(&request); } @@ -1770,7 +1899,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2135,7 +2264,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_buffer(tag, NULL, NULL); + ring_index = prefetch_register_buffer(tag, NULL); Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); @@ -2188,10 +2317,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) + neon_request_lsns request_lsns, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) + neon_request_lsns request_lsns, void *buffer) #endif { NeonResponse *resp; @@ -2223,7 +2352,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * value of the LwLsn cache when the entry is not found. */ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsn); + XLogWaitForReplayOf(request_lsns.request_lsn); /* * Try to find prefetched page in the list of received pages. 
@@ -2234,7 +2363,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) + if (neon_prefetch_response_usable(request_lsns, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; @@ -2268,8 +2397,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsn, - ¬_modified_since); + ring_index = prefetch_register_buffer(buftag, &request_lsns); slot = GetPrfSlot(ring_index); } else @@ -2310,7 +2438,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2333,8 +2461,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2359,9 +2486,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, - &request_lsn, ¬_modified_since); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2530,8 +2656,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber 
n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2558,13 +2683,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2584,7 +2708,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2595,10 +2719,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); pfree(resp); return n_blocks; @@ -2612,17 +2736,15 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn, - not_modified_since; + neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + 
request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; @@ -2639,8 +2761,7 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2650,9 +2771,7 @@ neon_dbsize(Oid dbNode) } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); pfree(resp); return db_size; @@ -2812,10 +2931,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2839,7 +2962,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. 
+ */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* @@ -2897,6 +3030,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf XLogRecPtr request_lsn, not_modified_since; + /* + * Compute a request LSN to use, similar to neon_get_request_lsns() but the + * logic is a bit simpler. + */ if (RecoveryInProgress()) { request_lsn = GetXLogReplayRecPtr(NULL); @@ -2908,10 +3045,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ request_lsn = GetRedoStartLsn(); } + request_lsn = nm_adjust_lsn(request_lsn); } else - request_lsn = GetXLogInsertRecPtr(); - request_lsn = nm_adjust_lsn(request_lsn); + request_lsn = UINT64_MAX; /* * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU @@ -3187,7 +3324,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) BufferTag tag; uint32 hash; LWLock *partitionLock; - Buffer buffer; + int buf_id; bool no_redo_needed; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) @@ -3225,20 +3362,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) else { /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); + buf_id = BufTableLookup(&tag, hash); - no_redo_needed = buffer < 0; + no_redo_needed = buf_id < 0; } - /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* * we don't have the buffer in memory, update lwLsn past this record, also * evict page from file cache */ if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); lfc_evict(rinfo, forknum, blkno); - + } LWLockRelease(partitionLock); diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index e5ef93b456..316e23a72e 100644 --- a/pgxn/neon/walproposer_pg.c +++ 
b/pgxn/neon/walproposer_pg.c @@ -1852,34 +1852,30 @@ static void CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin = InvalidFullTransactionId; + hs->catalog_xmin = InvalidFullTransactionId; for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].appendResponse.hs.ts != 0) + + if (wp->safekeeper[i].state == SS_ACTIVE) { HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin))) { hs->xmin = skhs->xmin; hs->ts = skhs->ts; } if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))) { hs->catalog_xmin = skhs->catalog_xmin; hs->ts = skhs->ts; } } } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -1946,14 +1942,28 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) + if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { + FullTransactionId xmin = hsFeedback.xmin; + FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; + FullTransactionId next_xid = ReadNextFullTransactionId(); + /* + * Page server is updating nextXid in checkpoint each 1024 transactions, + * so feedback xmin can be actually larger then nextXid and + * function TransactionIdInRecentPast return false in this case, + * preventing update of slot's xmin. 
+ */ + if (FullTransactionIdPrecedes(next_xid, xmin)) + xmin = next_xid; + if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) + catalog_xmin = next_xid; agg_hs_feedback = hsFeedback; + elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + XidFromFullTransactionId(xmin), + EpochFromFullTransactionId(xmin), + XidFromFullTransactionId(catalog_xmin), + EpochFromFullTransactionId(catalog_xmin)); } CheckGracefulShutdown(wp); diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 677006923d..47f245fbf1 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *relname; text *forkname; uint32 blkno; - - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; if (PG_NARGS() != 5) elog(ERROR, "unexpected number of arguments in SQL function signature"); @@ -312,8 +310,15 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); - request_lsn = 
PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); - not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); + /* + * For the time being, use the same LSN for request and + * effective request LSN. If any test needed to use UINT64_MAX + * as the request LSN, we'd need to add effective_request_lsn + * as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; if (!superuser()) ereport(ERROR, @@ -367,7 +372,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, + raw_page_data); relation_close(rel, AccessShareLock); @@ -413,19 +419,25 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) ForkNumber forknum = PG_GETARG_UINT32(3); uint32 blkno = PG_GETARG_UINT32(4); - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); - not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6); + request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); + /* + * For the time being, use the same LSN for request + * and effective request LSN. If any test needed to + * use UINT64_MAX as the request LSN, we'd need to add + * effective_request_lsn as a new argument. 
+ */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/poetry.lock b/poetry.lock index e437f5de74..25c0c7398d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + [[package]] name = "anyio" version = "4.3.0" @@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, 
- {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -798,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false 
-python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -931,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1268,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, 
including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1309,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." +optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1338,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] 
attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1371,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1422,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = 
{version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", 
"openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = ["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", 
"py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1654,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = 
"sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = 
"packaging" @@ -1702,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1728,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1840,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = "py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] +[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", 
"pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1861,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, 
+ {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = 
"pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = 
"pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", 
hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -2115,28 +2361,6 @@ files = [ 
[package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2216,15 +2440,103 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = 
"regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = 
"regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = 
"regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = 
"sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" -version = "2.31.0" +version = "2.32.0" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, ] [package.dependencies] @@ -2256,18 +2568,18 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = 
"sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" @@ -2366,22 +2678,18 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" optional = false -python-versions = ">=3" +python-versions = ">=3.8" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2899,4 +3207,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb" +content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3002006aed..7da0763bc1 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -9,6 +9,7 @@ default = [] testing = [] [dependencies] +ahash.workspace = true anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true @@ -24,8 +25,10 @@ camino.workspace = true 
chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true +crossbeam-deque.workspace = true dashmap.workspace = true env_logger.workspace = true +framed-websockets.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -35,7 +38,6 @@ hmac.workspace = true hostname.workspace = true http.workspace = true humantime.workspace = true -hyper-tungstenite.workspace = true hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } @@ -52,7 +54,6 @@ opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true -pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -76,7 +77,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true @@ -106,6 +106,8 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true fallible-iterator.workspace = true +tokio-tungstenite.workspace = true +pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3795e3b608..3555eba543 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use 
crate::console::provider::{CachedRoleSecret, ConsoleBackend}; @@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -280,6 +280,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. @@ -305,6 +306,10 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } + + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, @@ -360,7 +365,10 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret).await?; + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_outcome = + validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { @@ -381,7 +389,7 @@ async fn authenticate_with_secret( // Currently, we use it for websocket connections (latency). 
if allow_cleartext { ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - return hacks::authenticate_cleartext(ctx, info, client, secret).await; + return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). @@ -417,6 +425,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result> { use BackendType::*; @@ -428,8 +437,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { "performing authentication using the console" ); - let credentials = - auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
@@ -539,8 +556,8 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::RateBucketInfo, - scram::ServerSecret, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, + scram::{threadpool::ThreadPool, ServerSecret}, stream::{PqStream, Stream}, }; @@ -582,6 +599,7 @@ mod tests { } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), @@ -699,10 +717,20 @@ mod tests { _ => panic!("wrong message"), } }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -739,10 +767,20 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -780,9 +818,20 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); assert_eq!(creds.info.endpoint, 
"my-endpoint"); diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index f7241be4a9..6b0f5e1726 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -3,8 +3,10 @@ use super::{ }; use crate::{ auth::{self, AuthFlow}, + config::AuthenticationConfig, console::AuthSecret, context::RequestMonitoring, + intern::EndpointIdInt, sasl, stream::{self, Stream}, }; @@ -20,6 +22,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, + config: &'static AuthenticationConfig, ) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -27,8 +30,14 @@ pub async fn authenticate_cleartext( // pause the timer while we communicate with the client let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let ep = EndpointIdInt::from(&info.endpoint); + let auth_flow = AuthFlow::new(client) - .begin(auth::CleartextPassword(secret)) + .begin(auth::CleartextPassword { + secret, + endpoint: ep, + pool: config.thread_pool.clone(), + }) .await?; drop(paused); // cleartext auth is only allowed to the ws/http protocol. 
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 45bbad8cb2..59d1ac17f4 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -5,12 +5,14 @@ use crate::{ config::TlsServerEndPoint, console::AuthSecret, context::RequestMonitoring, - sasl, scram, + intern::EndpointIdInt, + sasl, + scram::{self, threadpool::ThreadPool}, stream::{PqStream, Stream}, }; use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::io; +use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword(pub AuthSecret); +pub struct CleartextPassword { + pub pool: Arc, + pub endpoint: EndpointIdInt, + pub secret: AuthSecret, +} impl AuthMethod for CleartextPassword { #[inline(always)] @@ -126,7 +132,13 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0).await?; + let outcome = validate_password_and_exchange( + &self.state.pool, + self.state.endpoint, + password, + self.state.secret, + ) + .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -181,6 +193,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } pub(crate) async fn validate_password_and_exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - let outcome = crate::scram::exchange(&scram_secret, password).await?; + let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, 
password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index d345781814..5be0653a09 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -132,6 +133,9 @@ struct ProxyCliArgs { /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, @@ -144,6 +148,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, /// Whether the auth rate limiter actually takes effect (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] auth_rate_limit_enabled: bool, @@ -154,7 +161,7 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] @@ -365,6 +372,10 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -373,6 +384,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -387,6 +399,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); } @@ -480,6 +493,9 @@ async fn main() -> anyhow::Result<()> { /// ProxyConfig is created at proxy startup, and lives forever. 
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, @@ -559,11 +575,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); - let api = - console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -610,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { + thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b7ab2c00f9..5a0c251ce2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, rate_limiter::RateBucketInfo, + 
scram::threadpool::ThreadPool, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, }; @@ -61,6 +62,7 @@ pub struct HttpConfig { } pub struct AuthenticationConfig { + pub thread_pool: Arc, pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index ec66641d01..7728d2cafa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -46,7 +46,7 @@ impl Api { endpoint, caches, locks, - endpoint_rate_limiter, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -283,7 +283,7 @@ impl super::Api for Api { // check rate limit if !self - .endpoint_rate_limiter + .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize().into(), 1) { return Err(WakeComputeError::TooManyConnections); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 8104fe6087..a213a32ca4 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -307,7 +307,7 @@ where } async fn upload_parquet( - w: SerializedFileWriter>, + mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, ) -> anyhow::Result> { @@ -319,11 +319,15 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. 
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (mut buffer, metadata) = + tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { + let metadata = w.finish()?; + let buffer = std::mem::take(w.inner_mut().get_mut()); + Ok((buffer, metadata)) + }) .await .unwrap()?; - let mut buffer = writer.into_inner(); let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; @@ -351,7 +355,7 @@ async fn upload_parquet( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; let cancel = CancellationToken::new(); - backoff::retry( + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); storage @@ -368,7 +372,12 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .context("request_data_upload") + .err(); + + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); + } Ok(buffer.writer()) } @@ -474,10 +483,11 @@ mod tests { RequestData { session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), - timestamp: chrono::NaiveDateTime::from_timestamp_millis( + timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) - .unwrap(), + .unwrap() + .naive_utc(), application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), @@ -560,15 +570,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + 
(1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 6000), + (438713, 1, 2000) ] ); @@ -598,11 +608,11 @@ mod tests { assert_eq!( file_stats, [ - (1221738, 5, 10000), - (1227888, 5, 10000), - (1229682, 5, 10000), - (1229044, 5, 10000), - (1220322, 5, 10000) + (1222212, 5, 10000), + (1228362, 5, 10000), + (1230156, 5, 10000), + (1229518, 5, 10000), + (1220796, 5, 10000) ] ); @@ -634,11 +644,11 @@ mod tests { assert_eq!( file_stats, [ - (1207385, 5, 10000), - (1207116, 5, 10000), - (1207409, 5, 10000), - (1207397, 5, 10000), - (1207652, 5, 10000) + (1207859, 5, 10000), + (1207590, 5, 10000), + (1207883, 5, 10000), + (1207871, 5, 10000), + (1208126, 5, 10000) ] ); @@ -663,15 +673,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + (1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 6000), + (438713, 1, 2000) ] ); @@ -708,7 +718,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] + [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 1590316925..e2a75a8720 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,11 +1,11 @@ -use std::sync::OnceLock; +use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::StaticLabelSet, + label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, 
HistogramVec, LabelGroup, - MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -14,26 +14,36 @@ use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; #[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] pub struct Metrics { #[metric(namespace = "proxy")] + #[metric(init = ProxyMetrics::new(thread_pool))] pub proxy: ProxyMetrics, #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, } +static SELF: OnceLock = OnceLock::new(); impl Metrics { + pub fn install(thread_pool: Arc) { + SELF.set(Metrics::new(thread_pool)) + .ok() + .expect("proxy metrics must not be installed more than once"); + } + pub fn get() -> &'static Self { - static SELF: OnceLock = OnceLock::new(); - SELF.get_or_init(|| Metrics { - proxy: ProxyMetrics::default(), - wake_compute_lock: ApiLockMetrics::new(), - }) + #[cfg(test)] + return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0)))); + + #[cfg(not(test))] + SELF.get() + .expect("proxy metrics must be installed by the main() function") } } #[derive(MetricGroup)] -#[metric(new())] +#[metric(new(thread_pool: Arc))] pub struct ProxyMetrics { #[metric(flatten)] pub db_connections: CounterPairVec, @@ -129,6 +139,10 @@ pub struct ProxyMetrics { #[metric(namespace = "connect_compute_lock")] pub connect_compute_lock: ApiLockMetrics, + + #[metric(namespace = "scram_pool")] + #[metric(init = thread_pool)] + pub scram_pool: Arc, } #[derive(MetricGroup)] @@ -146,12 +160,6 @@ pub struct ApiLockMetrics { pub semaphore_acquire_seconds: Histogram<16>, } -impl Default for ProxyMetrics { - fn default() -> Self { - Self::new() - } -} - impl Default for ApiLockMetrics { fn default() -> Self { Self::new() @@ -553,3 +561,52 @@ pub enum RedisEventsCount { PasswordUpdate, AllowedIpsUpdate, } + +pub struct ThreadPoolWorkers(usize); +pub 
struct ThreadPoolWorkerId(pub usize); + +impl LabelValue for ThreadPoolWorkerId { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0 as i64) + } +} + +impl LabelGroup for ThreadPoolWorkerId { + fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { + v.write_value(LabelName::from_str("worker"), self); + } +} + +impl LabelSet for ThreadPoolWorkers { + type Value<'a> = ThreadPoolWorkerId; + + fn dynamic_cardinality(&self) -> Option { + Some(self.0) + } + + fn encode(&self, value: Self::Value<'_>) -> Option { + (value.0 < self.0).then_some(value.0) + } + + fn decode(&self, value: usize) -> Self::Value<'_> { + ThreadPoolWorkerId(value) + } +} + +impl FixedCardinalitySet for ThreadPoolWorkers { + fn cardinality(&self) -> usize { + self.0 + } +} + +#[derive(MetricGroup)] +#[metric(new(workers: usize))] +pub struct ThreadPoolMetrics { + pub injector_queue_depth: Gauge, + #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_queue_depth: GaugeVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_turns_total: CounterVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_skips_total: CounterVec, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e4e095d77d..5824b70df9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,6 +19,7 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; @@ -61,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -86,6 +88,7 @@ pub async fn task_main( let cancellation_handler = Arc::clone(&cancellation_handler); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await{ @@ -123,6 +126,7 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, + endpoint_rate_limiter2, conn_gauge, ) .instrument(span.clone()) @@ -234,6 +238,7 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, + endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -243,7 +248,6 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - // let _client_gauge = metrics.client_connections.guard(proto); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -286,6 +290,7 @@ pub async fn handle_client( &mut stream, mode.allow_cleartext(), &config.authentication_config, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5ba2c36436..b8c9490696 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo { } impl RateBucketInfo { - pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + pub const DEFAULT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info 
@@ -266,7 +272,7 @@ mod tests { #[test] fn default_rate_buckets() { - let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET; + let mut defaults = RateBucketInfo::DEFAULT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } @@ -333,11 +339,8 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = BucketRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_ENDPOINT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { limiter.check(i, 1); } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index ed80675f8a..862facb4e5 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -6,11 +6,14 @@ //! * //! * +mod countmin; mod exchange; mod key; mod messages; +mod pbkdf2; mod secret; mod signature; +pub mod threadpool; pub use exchange::{exchange, Exchange}; pub use key::ScramKey; @@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::sasl::{Mechanism, Step}; + use crate::{ + intern::EndpointIdInt, + sasl::{Mechanism, Step}, + EndpointId, + }; - use super::{Exchange, ServerSecret}; + use super::{threadpool::ThreadPool, Exchange, ServerSecret}; #[test] fn snapshot() { @@ -112,8 +119,13 @@ mod tests { } async fn run_round_trip_test(server_password: &str, client_password: &str) { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + let scram_secret = ServerSecret::build(server_password).await.unwrap(); - let outcome = super::exchange(&scram_secret, client_password.as_bytes()) + let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes()) .await .unwrap(); diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs new file mode 100644 index 0000000000..f2b794e5fe --- /dev/null +++ b/proxy/src/scram/countmin.rs @@ -0,0 +1,173 @@ +use 
std::hash::Hash; + +/// estimator of hash jobs per second. +/// +pub struct CountMinSketch { + // one for each depth + hashers: Vec, + width: usize, + depth: usize, + // buckets, width*depth + buckets: Vec, +} + +impl CountMinSketch { + /// Given parameters (ε, δ), + /// set width = ceil(e/ε) + /// set depth = ceil(ln(1/δ)) + /// + /// guarantees: + /// actual <= estimate + /// estimate <= actual + ε * N with probability 1 - δ + /// where N is the cardinality of the stream + pub fn with_params(epsilon: f64, delta: f64) -> Self { + CountMinSketch::new( + (std::f64::consts::E / epsilon).ceil() as usize, + (1.0_f64 / delta).ln().ceil() as usize, + ) + } + + fn new(width: usize, depth: usize) -> Self { + Self { + #[cfg(test)] + hashers: (0..depth) + .map(|i| { + // digits of pi for good randomness + ahash::RandomState::with_seeds( + 314159265358979323, + 84626433832795028, + 84197169399375105, + 82097494459230781 + i as u64, + ) + }) + .collect(), + #[cfg(not(test))] + hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(), + width, + depth, + buckets: vec![0; width * depth], + } + } + + pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + let mut min = u32::MAX; + for row in 0..self.depth { + let col = (self.hashers[row].hash_one(t) as usize) % self.width; + + let row = &mut self.buckets[row * self.width..][..self.width]; + row[col] = row[col].saturating_add(x); + min = std::cmp::min(min, row[col]); + } + min + } + + pub fn reset(&mut self) { + self.buckets.clear(); + self.buckets.resize(self.width * self.depth, 0); + } +} + +#[cfg(test)] +mod tests { + use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + + use super::CountMinSketch; + + fn eval_precision(n: usize, p: f64, q: f64) -> usize { + // fixed value of phi for consistent test + let mut rng = StdRng::seed_from_u64(16180339887498948482); + + #[allow(non_snake_case)] + let mut N = 0; + + let mut ids = vec![]; + + for _ in 0..n { + // number of insert operations + let n = 
rng.gen_range(1..100); + // number to insert at once + let m = rng.gen_range(1..4096); + + let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + ids.push((id, n, m)); + + // N = sum(actual) + N += n * m; + } + + // q% of counts will be within p of the actual value + let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + dbg!(sketch.buckets.len()); + + // insert a bunch of entries in a random order + let mut ids2 = ids.clone(); + while !ids2.is_empty() { + ids2.shuffle(&mut rng); + + let mut i = 0; + while i < ids2.len() { + sketch.inc_and_return(&ids2[i].0, ids2[i].1); + ids2[i].2 -= 1; + if ids2[i].2 == 0 { + ids2.remove(i); + } else { + i += 1; + } + } + } + + let mut within_p = 0; + for (id, n, m) in ids { + let actual = n * m; + let estimate = sketch.inc_and_return(&id, 0); + + // This estimate has the guarantee that actual <= estimate + assert!(actual <= estimate); + + // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ. + // ε = p / N, δ = 1 - q; + // therefore, estimate <= actual + p with probability q. + if estimate as f64 <= actual as f64 + p { + within_p += 1; + } + } + within_p + } + + #[test] + fn precision() { + assert_eq!(eval_precision(100, 100.0, 0.99), 100); + assert_eq!(eval_precision(1000, 100.0, 0.99), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.99), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000); + + // seems to be more precise than the literature indicates? + // probably numbers are too small to truly represent the probabilities. + assert_eq!(eval_precision(100, 4096.0, 0.90), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.1), 98); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + } + + // returns memory usage in bytes, and the time complexity per insert. 
+ fn eval_cost(p: f64, q: f64) -> (usize, usize) { + #[allow(non_snake_case)] + // N = sum(actual) + // Let's assume 1021 samples, all of 4096 + let N = 1021 * 4096; + let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + let memory = std::mem::size_of::() * sketch.buckets.len(); + let time = sketch.depth; + (memory, time) + } + + #[test] + fn memory_usage() { + assert_eq!(eval_cost(100.0, 0.99), (2273580, 5)); + assert_eq!(eval_cost(4096.0, 0.99), (55520, 5)); + assert_eq!(eval_cost(4096.0, 0.90), (33312, 3)); + assert_eq!(eval_cost(4096.0, 0.1), (11104, 1)); + } +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 89dd33e59f..d0adbc780e 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -4,15 +4,17 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; -use tokio::task::yield_now; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; +use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::threadpool::ThreadPool; use super::ScramKey; use crate::config; +use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. 
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> { } } -// copied from -async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { - let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); - let mut prev = hmac - .clone() - .chain_update(salt) - .chain_update(1u32.to_be_bytes()) - .finalize() - .into_bytes(); - - let mut hi = prev; - - for i in 1..iterations { - prev = hmac.clone().chain_update(prev).finalize().into_bytes(); - - for (hi, prev) in hi.iter_mut().zip(prev) { - *hi ^= prev; - } - // yield every ~250us - // hopefully reduces tail latencies - if i % 1024 == 0 { - yield_now().await - } - } - - hi.into() -} - // copied from -async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey { - let salted_password = pbkdf2(password, salt, iterations).await; +async fn derive_client_key( + pool: &ThreadPool, + endpoint: EndpointIdInt, + password: &[u8], + salt: &[u8], + iterations: u32, +) -> ScramKey { + let salted_password = pool + .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) + .await + .expect("job should not be cancelled"); let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) @@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr } pub async fn exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, secret: &ServerSecret, password: &[u8], ) -> sasl::Result> { let salt = base64::decode(&secret.salt_base64)?; - let client_key = derive_client_key(password, &salt, secret.iterations).await; + let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; if secret.is_password_invalid(&client_key).into() { Ok(sasl::Outcome::Failure("password doesn't match")) diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs new file mode 100644 index 0000000000..a803ba7e1b --- /dev/null +++ b/proxy/src/scram/pbkdf2.rs @@ -0,0 +1,89 @@ +use hmac::{ + digest::{consts::U32, 
generic_array::GenericArray}, + Hmac, Mac, +}; +use sha2::Sha256; + +pub struct Pbkdf2 { + hmac: Hmac, + prev: GenericArray, + hi: GenericArray, + iterations: u32, +} + +// inspired from +impl Pbkdf2 { + pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + let hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + + let prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + Self { + hmac, + // one consumed for the hash above + iterations: iterations - 1, + hi: prev, + prev, + } + } + + pub fn cost(&self) -> u32 { + (self.iterations).clamp(0, 4096) + } + + pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + let Self { + hmac, + prev, + hi, + iterations, + } = self; + + // only do 4096 iterations per turn before sharing the thread for fairness + let n = (*iterations).clamp(0, 4096); + for _ in 0..n { + *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(*prev) { + *hi ^= prev; + } + } + + *iterations -= n; + if *iterations == 0 { + std::task::Poll::Ready((*hi).into()) + } else { + std::task::Poll::Pending + } + } +} + +#[cfg(test)] +mod tests { + use super::Pbkdf2; + use pbkdf2::pbkdf2_hmac_array; + use sha2::Sha256; + + #[test] + fn works() { + let salt = b"sodium chloride"; + let pass = b"Ne0n_!5_50_C007"; + + let mut job = Pbkdf2::start(pass, salt, 600000); + let hash = loop { + let std::task::Poll::Ready(hash) = job.turn() else { + continue; + }; + break hash; + }; + + let expected = pbkdf2_hmac_array::(pass, salt, 600000); + assert_eq!(hash, expected) + } +} diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs new file mode 100644 index 0000000000..7701b869a3 --- /dev/null +++ b/proxy/src/scram/threadpool.rs @@ -0,0 +1,321 @@ +//! Custom threadpool implementation for password hashing. +//! +//! Requirements: +//! 1. Fairness per endpoint. +//! 2. 
Yield support for high iteration counts. + +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; + +use crossbeam_deque::{Injector, Stealer, Worker}; +use itertools::Itertools; +use parking_lot::{Condvar, Mutex}; +use rand::Rng; +use rand::{rngs::SmallRng, SeedableRng}; +use tokio::sync::oneshot; + +use crate::{ + intern::EndpointIdInt, + metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, + scram::countmin::CountMinSketch, +}; + +use super::pbkdf2::Pbkdf2; + +pub struct ThreadPool { + queue: Injector, + stealers: Vec>, + parkers: Vec<(Condvar, Mutex)>, + /// bitpacked representation. + /// lower 8 bits = number of sleeping threads + /// next 8 bits = number of idle threads (searching for work) + counters: AtomicU64, + + pub metrics: Arc, +} + +#[derive(PartialEq)] +enum ThreadState { + Parked, + Active, +} + +impl ThreadPool { + pub fn new(n_workers: u8) -> Arc { + let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); + let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + + let parkers = (0..n_workers) + .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) + .collect_vec(); + + let pool = Arc::new(Self { + queue: Injector::new(), + stealers, + parkers, + // threads start searching for work + counters: AtomicU64::new((n_workers as u64) << 8), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + + for (i, worker) in workers.into_iter().enumerate() { + let pool = Arc::clone(&pool); + std::thread::spawn(move || thread_rt(pool, worker, i)); + } + + pool + } + + pub fn spawn_job( + &self, + endpoint: EndpointIdInt, + pbkdf2: Pbkdf2, + ) -> oneshot::Receiver<[u8; 32]> { + let (tx, rx) = oneshot::channel(); + + let queue_was_empty = self.queue.is_empty(); + + self.metrics.injector_queue_depth.inc(); + self.queue.push(JobSpec { + response: tx, + pbkdf2, + endpoint, + }); + + // inspired from + let counts = self.counters.load(Ordering::SeqCst); + let num_awake_but_idle = (counts >> 8) & 0xff; + let 
num_sleepers = counts & 0xff; + + // If the queue is non-empty, then we always wake up a worker + // -- clearly the existing idle jobs aren't enough. Otherwise, + // check to see if we have enough idle workers. + if !queue_was_empty || num_awake_but_idle == 0 { + let num_to_wake = Ord::min(1, num_sleepers); + self.wake_any_threads(num_to_wake); + } + + rx + } + + #[cold] + fn wake_any_threads(&self, mut num_to_wake: u64) { + if num_to_wake > 0 { + for i in 0..self.parkers.len() { + if self.wake_specific_thread(i) { + num_to_wake -= 1; + if num_to_wake == 0 { + return; + } + } + } + } + } + + fn wake_specific_thread(&self, index: usize) -> bool { + let (condvar, lock) = &self.parkers[index]; + + let mut state = lock.lock(); + if *state == ThreadState::Parked { + condvar.notify_one(); + + // When the thread went to sleep, it will have incremented + // this value. When we wake it, its our job to decrement + // it. We could have the thread do it, but that would + // introduce a delay between when the thread was + // *notified* and when this counter was decremented. That + // might mislead people with new work into thinking that + // there are sleeping threads that they should try to + // wake, when in fact there is nothing left for them to + // do. 
+ self.counters.fetch_sub(1, Ordering::SeqCst); + *state = ThreadState::Active; + + true + } else { + false + } + } + + fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { + // announce thread as idle + self.counters.fetch_add(256, Ordering::SeqCst); + + // try steal from the global queue + loop { + match self.queue.steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => { + self.metrics + .injector_queue_depth + .set(self.queue.len() as i64); + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return Some(job); + } + crossbeam_deque::Steal::Retry => continue, + crossbeam_deque::Steal::Empty => break, + } + } + + // try steal from our neighbours + loop { + let mut retry = false; + let start = rng.gen_range(0..self.stealers.len()); + let job = (start..self.stealers.len()) + .chain(0..start) + .filter(|i| *i != skip) + .find_map( + |victim| match self.stealers[victim].steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => Some(job), + crossbeam_deque::Steal::Empty => None, + crossbeam_deque::Steal::Retry => { + retry = true; + None + } + }, + ); + if job.is_some() { + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return job; + } + if !retry { + return None; + } + } + } +} + +fn thread_rt(pool: Arc, worker: Worker, index: usize) { + /// interval when we should steal from the global queue + /// so that tail latencies are managed appropriately + const STEAL_INTERVAL: usize = 61; + + /// How often to reset the sketch values + const SKETCH_RESET_INTERVAL: usize = 1021; + + let mut rng = SmallRng::from_entropy(); + + // used to determine whether we should temporarily skip tasks for fairness. 
+ // 99% of estimates will overcount by no more than 4096 samples + let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); + + let (condvar, lock) = &pool.parkers[index]; + + 'wait: loop { + // wait for notification of work + { + let mut lock = lock.lock(); + + // queue is empty + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), 0); + + // subtract 1 from idle count, add 1 to sleeping count. + pool.counters.fetch_sub(255, Ordering::SeqCst); + + *lock = ThreadState::Parked; + condvar.wait(&mut lock); + } + + for i in 0.. { + let mut job = match worker + .pop() + .or_else(|| pool.steal(&mut rng, index, &worker)) + { + Some(job) => job, + None => continue 'wait, + }; + + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), worker.len() as i64); + + // receiver is closed, cancel the task + if !job.response.is_closed() { + let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); + + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. 
+ let probability = P.ln() / (P + rate as f64).ln(); + if pool.queue.len() > 32 || rng.gen_bool(probability) { + pool.metrics + .worker_task_turns_total + .inc(ThreadPoolWorkerId(index)); + + match job.pbkdf2.turn() { + std::task::Poll::Ready(result) => { + let _ = job.response.send(result); + } + std::task::Poll::Pending => worker.push(job), + } + } else { + pool.metrics + .worker_task_skips_total + .inc(ThreadPoolWorkerId(index)); + + // skip for now + worker.push(job) + } + } + + // if we get stuck with a few long lived jobs in the queue + // it's better to try and steal from the queue too for fairness + if i % STEAL_INTERVAL == 0 { + let _ = pool.queue.steal_batch(&worker); + } + + if i % SKETCH_RESET_INTERVAL == 0 { + sketch.reset(); + } + } + } +} + +struct JobSpec { + response: oneshot::Sender<[u8; 32]>, + pbkdf2: Pbkdf2, + endpoint: EndpointIdInt, +} + +#[cfg(test)] +mod tests { + use crate::EndpointId; + + use super::*; + + #[tokio::test] + async fn hash_is_correct() { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let salt = [0x55; 32]; + let actual = pool + .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) + .await + .unwrap(); + + let expected = [ + 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, + 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, + ]; + assert_eq!(actual, expected) + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index cbff51f207..24ee749e6e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -36,6 +36,7 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; +use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -54,6 +55,7 @@ pub async fn task_main( ws_listener: 
TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -82,6 +84,7 @@ pub async fn task_main( let backend = Arc::new(PoolingBackend { pool: Arc::clone(&conn_pool), config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_config = match config.tls_config.as_ref() { @@ -99,7 +102,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` - let server = Builder::new(hyper_util::rt::TokioExecutor::new()); + let server = Builder::new(TokioExecutor::new()); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; @@ -129,6 +132,7 @@ pub async fn task_main( backend.clone(), connections.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), conn_token.clone(), server.clone(), tls_acceptor.clone(), @@ -162,6 +166,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -245,11 +250,11 @@ async fn connection_handler( session_id, peer_addr, http_request_token, + endpoint_rate_limiter.clone(), ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), ); - async move { let res = handler.await; cancel_request.disarm(); @@ -285,6 +290,7 @@ async fn request_handler( peer_addr: IpAddr, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> Result>, ApiError> { let host = request .headers() @@ -294,7 +300,7 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. 
- if hyper_tungstenite::is_upgrade_request(&request) { + if framed_websockets::upgrade::is_upgrade_request(&request) { let ctx = RequestMonitoring::new( session_id, peer_addr, @@ -305,14 +311,20 @@ async fn request_handler( let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); - let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( async move { - if let Err(e) = - websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) - .await + if let Err(e) = websocket::serve_websocket( + config, + ctx, + websocket, + cancellation_handler, + endpoint_rate_limiter, + host, + ) + .await { error!("error in websocket connection: {e:#}"); } @@ -321,7 +333,7 @@ async fn request_handler( ); // Return the response so the spawned future can continue. - Ok(response) + Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index ce58f575e2..52fc7b556a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -15,7 +15,9 @@ use crate::{ }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, + intern::EndpointIdInt, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, Host, }; @@ -24,6 +26,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, } impl PoolingBackend { @@ -39,6 +42,12 @@ impl PoolingBackend { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return 
Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => backend.get_role_secret(ctx).await?, @@ -58,8 +67,14 @@ impl PoolingBackend { return Err(AuthError::auth_failed(&*user_info.user)); } }; - let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; + let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); + let auth_outcome = crate::auth::validate_password_and_exchange( + &config.thread_pool, + ep, + &conn_info.password, + secret, + ) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index b6cd85af73..61d6d60dbe 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,11 +5,13 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; -use bytes::{Buf, Bytes}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper::upgrade::Upgraded; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; +use hyper1::upgrade::OnUpgrade; +use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; use std::{ @@ -20,25 +22,23 @@ use std::{ use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -// TODO: use `std::sync::Exclusive` once it's stabilized. -// Tracking issue: https://github.com/rust-lang/rust/issues/98407. -use sync_wrapper::SyncWrapper; - pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
- pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, - bytes: Bytes, + stream: WebSocketServer, + recv: Bytes, + send: BytesMut, } } impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { + pub fn new(stream: WebSocketServer) -> Self { Self { - stream: stream.into(), - bytes: Bytes::new(), + stream, + recv: Bytes::new(), + send: BytesMut::new(), } } } @@ -49,22 +49,24 @@ impl AsyncWrite for WebSocketRw { cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { - let mut stream = self.project().stream.get_pin_mut(); + let this = self.project(); + let mut stream = this.stream; + this.send.put(buf); ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; - match stream.as_mut().start_send(Message::Binary(buf.into())) { + match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_flush(cx).map_err(io_error) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_close(cx).map_err(io_error) } } @@ -75,13 +77,10 @@ impl AsyncRead for WebSocketRw { cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() > 0 { - let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; - let len = std::cmp::min(bytes.len(), buf.remaining()); - buf.put_slice(&bytes[..len]); - self.consume(len); - } - + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); Poll::Ready(Ok(())) } } @@ -93,31 +92,27 @@ impl AsyncBufRead for WebSocketRw { let mut this = self.project(); loop { - if !this.bytes.chunk().is_empty() { - let chunk = (*this.bytes).chunk(); + if 
!this.recv.chunk().is_empty() { + let chunk = (*this.recv).chunk(); return Poll::Ready(Ok(chunk)); } - let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + let res = ready!(this.stream.as_mut().poll_next(cx)); match res.transpose().map_err(io_error)? { - Some(message) => match message { - Message::Ping(_) => {} - Message::Pong(_) => {} - Message::Text(text) => { + Some(message) => match message.opcode { + OpCode::Ping => {} + OpCode::Pong => {} + OpCode::Text => { // We expect to see only binary messages. let error = "unexpected text message in the websocket"; - warn!(length = text.len(), error); + warn!(length = message.payload.len(), error); return Poll::Ready(Err(io_error(error))); } - Message::Frame(_) => { - // This case is impossible according to Frame's doc. - panic!("unexpected raw frame in the websocket"); + OpCode::Binary | OpCode::Continuation => { + debug_assert!(this.recv.is_empty()); + *this.recv = message.payload.freeze(); } - Message::Binary(chunk) => { - assert!(this.bytes.is_empty()); - *this.bytes = Bytes::from(chunk); - } - Message::Close(_) => return EOF, + OpCode::Close => return EOF, }, None => return EOF, } @@ -125,18 +120,21 @@ impl AsyncBufRead for WebSocketRw { } fn consume(self: Pin<&mut Self>, amount: usize) { - self.project().bytes.advance(amount); + self.project().recv.advance(amount); } } pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - websocket: HyperWebsocket, + websocket: OnUpgrade, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); + let conn_gauge = Metrics::get() .proxy .client_connections @@ -148,6 +146,7 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, + endpoint_rate_limiter, conn_gauge, ) .await; @@ -174,15 +173,16 @@ pub async fn 
serve_websocket( mod tests { use std::pin::pin; + use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use hyper_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; use tokio::{ io::{duplex, AsyncReadExt, AsyncWriteExt}, task::JoinSet, }; + use tokio_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; use super::WebSocketRw; @@ -207,9 +207,7 @@ mod tests { }); js.spawn(async move { - let mut rw = pin!(WebSocketRw::new( - WebSocketStream::from_raw_socket(stream2, Role::Server, None).await - )); + let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2))); let mut buf = vec![0; 1024]; let n = rw.read(&mut buf).await.unwrap(); diff --git a/pyproject.toml b/pyproject.toml index 4ec8efc2ff..131d1121f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.31.0" +requests = "^2.32.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" @@ -19,7 +19,7 @@ types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index c1deab8852..8a96542ada 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants #### S3 -Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help). +Do `aws sso login --profile dev` to get the SSO access to the bucket to clean. 
+Also, set the following environment variables: -- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets +- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`) - `REGION`: A region where the bucket is located at. - `BUCKET`: Bucket name +- `BUCKET_PREFIX` (optional): Prefix inside the bucket #### Console API @@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand. Example: -`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` #### `purge-garbage` @@ -59,7 +61,7 @@ to pass them on the command line Example: -`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag @@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout. 
For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver +env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7c0f699958..dd64a0a98f 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +170,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. 
#[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +179,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,7 +207,7 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, + layer_file: &LayerName, metadata: &IndexLayerMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { @@ -224,7 +223,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -247,25 +246,25 @@ pub(crate) struct S3TimelineBlobData { #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } @@ -369,7 +368,7 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e976e66748..e0f99ecd9c 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -200,30 +200,15 @@ impl RootTarget { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct BucketConfig { pub region: String, pub bucket: String, pub prefix_in_bucket: Option, - - /// Use SSO if this is set, else rely on AWS_* environment vars - pub sso_account_id: Option, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.sso_account_id.as_deref().unwrap_or(""), - self.region, - self.bucket - ) - } } impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); @@ -232,7 +217,6 @@ impl BucketConfig { region, bucket, prefix_in_bucket, - sso_account_id, }) } } @@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { guard } -pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { +pub fn init_s3_client(bucket_region: Region) -> Client { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" let chain 
= CredentialsProviderChain::first_try( @@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ); // Use SSO if we were given an account ID - match account_id { + match std::env::var("SSO_ACCOUNT_ID").ok() { Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() @@ -312,7 +296,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) @@ -331,7 +318,7 @@ fn init_remote( ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region)); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 4eccad381b..a24a1e92ae 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use utils::generation::Generation; @@ -48,16 +48,16 @@ impl SnapshotDownloader { async fn download_layer( &self, ttid: TenantShardTimelineId, - layer_name: LayerFileName, + layer_name: LayerName, 
layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( "{}/timelines/{}/{}{}", ttid.tenant_shard_id, ttid.timeline_id, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() )); @@ -76,7 +76,7 @@ impl SnapshotDownloader { let remote_layer_path = format!( "{}{}{}", timeline_root.prefix_in_bucket, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() ); @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerFileName, IndexLayerMetadata)>, + layers: Vec<(LayerName, IndexLayerMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -138,7 +138,7 @@ impl SnapshotDownloader { tracing::info!( "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", layer_metadata.file_size, - layer_name.file_name() + layer_name ); } Err(e) => { @@ -159,11 +159,11 @@ impl SnapshotDownloader { async fn download_timeline( &self, ttid: TenantShardTimelineId, - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, - HashMap, + HashMap, >, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +234,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 
09c565ce71..aee3898ac7 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -20,7 +20,6 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use storage_broker::Uri; -use tokio::sync::mpsc; use tracing::*; use utils::pid_file; @@ -30,13 +29,13 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; +use safekeeper::remove_wal; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; use safekeeper::{http, WAL_REMOVER_RUNTIME}; -use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -377,8 +376,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - wal_backup::init_remote_storage(&conf); // Keep handles to main tasks to die if any of them disappears. @@ -391,19 +388,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let current_thread_rt = conf .current_thread_runtime .then(|| Handle::try_current().expect("no runtime in main")); - let conf_ = conf.clone(); - let wal_backup_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) - .spawn(wal_backup::wal_backup_launcher_task_main( - conf_, - wal_backup_launcher_rx, - )) - .map(|res| ("WAL backup launcher".to_owned(), res)); - tasks_handles.push(Box::pin(wal_backup_handle)); // Load all timelines from disk to memory. 
- GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?; + GlobalTimelines::init(conf.clone()).await?; let conf_ = conf.clone(); // Run everything in current thread rt, if asked. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 98f58d3e49..7cc2142291 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -46,6 +46,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { return Ok(()); } + let active_timelines_set = GlobalTimelines::get_global_broker_active_set(); + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -57,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let all_tlis = GlobalTimelines::get_all(); + let all_tlis = active_timelines_set.get_all(); let mut n_pushed_tlis = 0; for tli in &all_tlis { - // filtering alternative futures::stream::iter(all_tlis) - // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; - // doesn't look better, and I'm not sure how to do that without collect. - if !tli.is_active().await { - continue; - } let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); @@ -90,6 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. 
+#[instrument(name = "broker pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; @@ -186,6 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result< commit_lsn: sk_info.commit_lsn, safekeeper_connstr: sk_info.safekeeper_connstr, availability_zone: sk_info.availability_zone, + standby_horizon: 0, }; // note this is a blocking call @@ -319,7 +317,7 @@ async fn task_stats(stats: Arc) { let now = BrokerStats::now_millis(); if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { - let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); info!("no broker updates for some time, last update: {:?}", ts); } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9ce26e6c5d..808bb1e490 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result RouterBuilder .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 543714a54e..8d8d2cf23e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -31,6 +31,8 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_manager; +pub mod timelines_set; pub mod wal_backup; pub mod wal_backup_partial; pub mod wal_service; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 28ae042bb3..1e965393e3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,8 +11,9 @@ use 
futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, - IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, + register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, + IntGaugeVec, }; use once_cell::sync::Lazy; @@ -162,6 +163,29 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") }); +pub static MANAGER_ITERATIONS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_iterations_total", + "Number of iterations of the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_iterations_total counter") +}); +pub static MANAGER_ACTIVE_CHANGES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_active_changes_total", + "Number of timeline active status changes in the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_active_changes_total counter") +}); +pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "safekeeper_wal_backup_tasks_started_total", + "Number of active WAL backup tasks", + "safekeeper_wal_backup_tasks_finished_total", + "Number of finished WAL backup tasks", + ) + .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -614,8 +638,7 @@ impl Collector for TimelineCollector { self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::get_all(); - let timelines_count = timelines.len(); + let timelines_count = GlobalTimelines::get_all().len(); let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async 
lock. To @@ -746,9 +769,9 @@ impl Collector for TimelineCollector { async fn collect_timeline_metrics() -> Vec { let mut res = vec![]; - let timelines = GlobalTimelines::get_all(); + let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all(); - for tli in timelines { + for tli in active_timelines { if let Some(info) = tli.info_for_metrics().await { res.push(info); } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 015b53bb2e..03cfa882c4 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -45,6 +45,9 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8; pub struct WalReceivers { mutex: Mutex, pageserver_feedback_tx: tokio::sync::broadcast::Sender, + + num_computes_tx: tokio::sync::watch::Sender, + num_computes_rx: tokio::sync::watch::Receiver, } /// Id under which walreceiver is registered in shmem. @@ -55,16 +58,21 @@ impl WalReceivers { let (pageserver_feedback_tx, _) = tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), pageserver_feedback_tx, + num_computes_tx, + num_computes_rx, }) } /// Register new walreceiver. Returned guard provides access to the slot and /// automatically deregisters in Drop. pub fn register(self: &Arc, conn_id: Option) -> WalReceiverGuard { - let slots = &mut self.mutex.lock().slots; + let mut shared = self.mutex.lock(); + let slots = &mut shared.slots; let walreceiver = WalReceiverState { conn_id, status: WalReceiverStatus::Voting, @@ -78,6 +86,9 @@ impl WalReceivers { slots.push(Some(walreceiver)); pos }; + + self.update_num(&shared); + WalReceiverGuard { id: pos, walreceivers: self.clone(), @@ -99,7 +110,18 @@ impl WalReceivers { /// Get number of walreceivers (compute connections). 
pub fn get_num(self: &Arc) -> usize { - self.mutex.lock().slots.iter().flatten().count() + self.mutex.lock().get_num() + } + + /// Get channel for number of walreceivers. + pub fn get_num_rx(self: &Arc) -> tokio::sync::watch::Receiver { + self.num_computes_rx.clone() + } + + /// Should get called after every update of slots. + fn update_num(self: &Arc, shared: &MutexGuard) { + let num = shared.get_num(); + self.num_computes_tx.send_replace(num); } /// Get state of all walreceivers. @@ -123,6 +145,7 @@ impl WalReceivers { fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; + self.update_num(&shared); } /// Broadcast pageserver feedback to connected walproposers. @@ -137,6 +160,13 @@ struct WalReceiversShared { slots: Vec>, } +impl WalReceiversShared { + /// Get number of walreceivers (compute connections). + fn get_num(&self) -> usize { + self.slots.iter().flatten().count() + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -183,9 +213,19 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + let mut tli: Option> = None; + if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + let handle_end_fut = pgb.handle_copy_stream_end(end); + // If we managed to create the timeline, augment logging with current LSNs etc. 
+ if let Some(tli) = tli { + let info = tli.get_safekeeper_info(&self.conf).await; + handle_end_fut + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; + } else { + handle_end_fut.await; + } } Ok(()) } @@ -193,6 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, + tli: &mut Option>, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -222,13 +263,17 @@ impl SafekeeperPostgresHandler { // Read first message and create timeline if needed. let res = network_reader.read_first_message().await; - let res = if let Ok((tli, next_msg)) = res { + let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = - tli.get_walreceivers().pageserver_feedback_tx.subscribe(); + timeline + .get_walreceivers() + .pageserver_feedback_tx + .subscribe(); + *tli = Some(timeline.clone()); tokio::select! { // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, } } else { @@ -244,13 +289,13 @@ impl SafekeeperPostgresHandler { match acceptor_handle { None => { // failed even before spawning; read_network should have error - Err(res.expect_err("no error with WalAcceptor not spawn")) + Err(network_res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { let wal_acceptor_res = handle.await; // If there was any network error, return it. - res?; + network_res?; // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { @@ -441,14 +486,7 @@ impl WalAcceptor { /// The main loop. 
Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. async fn run(&mut self) -> anyhow::Result<()> { - // Register the connection and defer unregister. - // Order of the next two lines is important: we want first to remove our entry and then - // update status which depends on registered connections. - let _compute_conn_guard = ComputeConnectionGuard { - timeline: Arc::clone(&self.tli), - }; let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - self.tli.update_status_notify().await?; // After this timestamp we will stop processing AppendRequests and send a response // to the walproposer. walproposer sends at least one AppendRequest per second, @@ -514,19 +552,3 @@ impl WalAcceptor { } } } - -/// Calls update_status_notify in drop to update timeline status. -struct ComputeConnectionGuard { - timeline: Arc, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - let tli = self.timeline.clone(); - tokio::spawn(async move { - if let Err(e) = tli.update_status_notify().await { - error!("failed to update timeline status: {}", e); - } - }); - } -} diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index e8fa6c55f4..dfa1892c40 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -37,17 +37,11 @@ use crate::{ #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { info!("started"); - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; + let cancel = tli.cancel.clone(); select! 
{ _ = recovery_main_loop(tli, conf) => { unreachable!() } - _ = cancellation_rx.changed() => { + _ = cancel.cancelled() => { info!("stopped"); } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 9dce06a886..98ce671182 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -7,29 +7,18 @@ use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; -const ALLOW_INACTIVE_TIMELINES: bool = true; - -pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { +pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { let now = tokio::time::Instant::now(); - let mut active_timelines = 0; - let tlis = GlobalTimelines::get_all(); for tli in &tlis { - let is_active = tli.is_active().await; - if is_active { - active_timelines += 1; - } - if !ALLOW_INACTIVE_TIMELINES && !is_active { - continue; - } let ttid = tli.ttid; async { if let Err(e) = tli.maybe_persist_control_file().await { warn!("failed to persist control file: {e}"); } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await { + if let Err(e) = tli.remove_old_wal().await { error!("failed to remove WAL: {}", e); } } @@ -42,8 +31,8 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { if elapsed > wal_removal_interval { info!( - "WAL removal is too long, processed {} active timelines ({} total) in {:?}", - active_timelines, total_timelines, elapsed + "WAL removal is too long, processed {} timelines in {:?}", + total_timelines, elapsed ); } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 59a8c595ab..5a9745e1c9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -23,7 +23,7 @@ use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; -use std::cmp::min; +use std::cmp::{max, min}; use std::net::SocketAddr; use std::str; use std::sync::Arc; @@ -85,8 
+85,17 @@ impl StandbyReply { #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyFeedback { - reply: StandbyReply, - hs_feedback: HotStandbyFeedback, + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } } /// WalSenders registry. Timeline holds it (wrapped in Arc). @@ -162,8 +171,8 @@ impl WalSenders { } /// Get aggregated hot standby feedback (we send it to compute). - pub fn get_hotstandby(self: &Arc) -> HotStandbyFeedback { - self.mutex.lock().agg_hs_feedback + pub fn get_hotstandby(self: &Arc) -> StandbyFeedback { + self.mutex.lock().agg_standby_feedback } /// Record new pageserver feedback, update aggregated values. @@ -184,6 +193,10 @@ impl WalSenders { fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); + debug!( + "Record standby reply: ts={} apply_lsn={}", + reply.reply_ts, reply.apply_lsn + ); match &mut slot.feedback { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { @@ -208,7 +221,7 @@ impl WalSenders { }) } } - shared.update_hs_feedback(); + shared.update_reply_feedback(); } /// Get remote_consistent_lsn reported by the pageserver. 
Returns None if @@ -226,13 +239,13 @@ impl WalSenders { fn unregister(self: &Arc, id: WalSenderId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; - shared.update_hs_feedback(); + shared.update_reply_feedback(); } } struct WalSendersShared { // aggregated over all walsenders value - agg_hs_feedback: HotStandbyFeedback, + agg_standby_feedback: StandbyFeedback, // last feedback ever received from any pageserver, empty if none last_ps_feedback: PageserverFeedback, // total counter of pageserver feedbacks received @@ -243,7 +256,7 @@ struct WalSendersShared { impl WalSendersShared { fn new() -> Self { WalSendersShared { - agg_hs_feedback: HotStandbyFeedback::empty(), + agg_standby_feedback: StandbyFeedback::empty(), last_ps_feedback: PageserverFeedback::empty(), ps_feedback_counter: 0, slots: Vec::new(), @@ -260,10 +273,11 @@ impl WalSendersShared { self.slots[id].as_mut().expect("walsender doesn't exist") } - /// Update aggregated hot standy feedback. We just take min of valid xmins + /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins /// and ts. 
- fn update_hs_feedback(&mut self) { + fn update_reply_feedback(&mut self) { let mut agg = HotStandbyFeedback::empty(); + let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { let hs_feedback = standby_feedback.hs_feedback; @@ -276,7 +290,7 @@ impl WalSendersShared { } else { agg.xmin = hs_feedback.xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); } if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { @@ -284,11 +298,43 @@ impl WalSendersShared { } else { agg.catalog_xmin = hs_feedback.catalog_xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); + } + let reply = standby_feedback.reply; + if reply.write_lsn != Lsn::INVALID { + if reply_agg.write_lsn != Lsn::INVALID { + reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn); + } else { + reply_agg.write_lsn = reply.write_lsn; + } + } + if reply.flush_lsn != Lsn::INVALID { + if reply_agg.flush_lsn != Lsn::INVALID { + reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn); + } else { + reply_agg.flush_lsn = reply.flush_lsn; + } + } + if reply.apply_lsn != Lsn::INVALID { + if reply_agg.apply_lsn != Lsn::INVALID { + reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn); + } else { + reply_agg.apply_lsn = reply.apply_lsn; + } + } + if reply.reply_ts != 0 { + if reply_agg.reply_ts != 0 { + reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts); + } else { + reply_agg.reply_ts = reply.reply_ts; + } } } } - self.agg_hs_feedback = agg; + self.agg_standby_feedback = StandbyFeedback { + reply: reply_agg, + hs_feedback: agg, + }; } } @@ -340,12 +386,16 @@ impl SafekeeperPostgresHandler { start_pos: Lsn, term: Option, ) -> Result<(), QueryError> { + let tli = GlobalTimelines::get(self.ttid).map_err(|e| 
QueryError::Other(e.into()))?; if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term) + .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) .await { + let info = tli.get_safekeeper_info(&self.conf).await; // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + pgb.handle_copy_stream_end(end) + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn))) + .await; } Ok(()) } @@ -355,10 +405,9 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, + tli: Arc, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); - let tli = - GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; // Use a guard object to remove our entry from the timeline when we are done. let ws_guard = Arc::new(tli.get_walsenders().register( @@ -707,8 +756,15 @@ impl ReplyReader { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. - let hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let mut hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; + // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way: + // pq_sendint32(&reply_message, xmin); + // pq_sendint32(&reply_message, xmin_epoch); + // So it is two big endian 32-bit words in low endian order! 
+ hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); + hs_feedback.catalog_xmin = + (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); @@ -790,8 +846,11 @@ mod tests { fn test_hs_feedback_no_valid() { let mut wss = WalSendersShared::new(); push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID); + wss.update_reply_feedback(); + assert_eq!( + wss.agg_standby_feedback.hs_feedback.xmin, + INVALID_FULL_TRANSACTION_ID + ); } #[test] @@ -800,7 +859,7 @@ mod tests { push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); push_feedback(&mut wss, hs_feedback(1, 42)); push_feedback(&mut wss, hs_feedback(1, 64)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, 42); + wss.update_reply_feedback(); + assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42); } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 64f764f191..89c157d514 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -6,15 +6,15 @@ use camino::Utf8PathBuf; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; +use tokio_util::sync::CancellationToken; use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, MutexGuard}; -use tokio::{ - sync::{mpsc::Sender, watch}, - time::Instant, -}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::{sync::watch, time::Instant}; use tracing::*; use utils::http::error::ApiError; use utils::{ @@ -33,12 +33,13 @@ use crate::safekeeper::{ }; use crate::send_wal::WalSenders; use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; use 
crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_backup_partial, wal_storage}; +use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -51,8 +52,7 @@ pub struct PeerInfo { /// LSN of the last record. pub flush_lsn: Lsn, pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new - /// sk since backup_lsn. + /// Since which LSN safekeeper has WAL. pub local_start_lsn: Lsn, /// When info was received. Serde annotations are not very useful but make /// the code compile -- we don't rely on this field externally. @@ -97,25 +97,72 @@ impl PeersInfo { } } +pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; + +/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard` that +/// automatically updates `watch::Sender` channels with state on drop. 
+pub struct WriteGuardSharedState<'a> { + tli: Arc, + guard: RwLockWriteGuard<'a, SharedState>, +} + +impl<'a> WriteGuardSharedState<'a> { + fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { + WriteGuardSharedState { tli, guard } + } +} + +impl<'a> Deref for WriteGuardSharedState<'a> { + type Target = SharedState; + + fn deref(&self) -> &Self::Target { + &self.guard + } +} + +impl<'a> DerefMut for WriteGuardSharedState<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.guard + } +} + +impl<'a> Drop for WriteGuardSharedState<'a> { + fn drop(&mut self) { + let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + + let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { + if *old != term_flush_lsn { + *old = term_flush_lsn; + true + } else { + false + } + }); + + let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| { + if *old != commit_lsn { + *old = commit_lsn; + true + } else { + false + } + }); + + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - sk: SafeKeeper, + pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. - peers_info: PeersInfo, - /// True when WAL backup launcher oversees the timeline, making sure WAL is - /// offloaded, allows to bother launcher less. - wal_backup_active: bool, - /// True whenever there is at least some pending activity on timeline: live - /// compute connection, pageserver is not caughtup (it must have latest WAL - /// for new compute start) or WAL backuping is not finished. Practically it - /// means safekeepers broadcast info to peers about the timeline, old WAL is - /// trimmed. 
- /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. - active: bool, - last_removed_segno: XLogSegNo, + pub(crate) peers_info: PeersInfo, + pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -152,8 +199,6 @@ impl SharedState { Ok(Self { sk, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } @@ -171,75 +216,10 @@ impl SharedState { Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } - fn is_active(&self, num_computes: usize) -> bool { - self.is_wal_backup_required(num_computes) - // FIXME: add tracking of relevant pageservers and check them here individually, - // otherwise migration won't work (we suspend too early). - || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn - } - - /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. If timeline is deactivated, control file is persisted - /// as maintenance task does that only for active timelines. - async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool { - let is_active = self.is_active(num_computes); - if self.active != is_active { - info!( - "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", - ttid, - is_active, - self.sk.state.inmem.remote_consistent_lsn, - self.sk.state.inmem.commit_lsn - ); - if !is_active { - if let Err(e) = self.sk.state.flush().await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - self.active = is_active; - self.is_wal_backup_action_pending(num_computes) - } - - /// Should we run s3 offloading in current state? 
- fn is_wal_backup_required(&self, num_computes: usize) -> bool { - let seg_size = self.get_wal_seg_size(); - num_computes > 0 || - // Currently only the whole segment is offloaded, so compare segment numbers. - (self.sk.state.inmem.commit_lsn.segment_number(seg_size) > - self.sk.state.inmem.backup_lsn.segment_number(seg_size)) - } - - /// Is current state of s3 offloading is not what it ought to be? - fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool { - let res = self.wal_backup_active != self.is_wal_backup_required(num_computes); - if res { - let action_pending = if self.is_wal_backup_required(num_computes) { - "start" - } else { - "stop" - }; - trace!( - "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", - self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn - ); - } - res - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching. - fn wal_backup_attend(&mut self, num_computes: usize) -> bool { - self.wal_backup_active = self.is_wal_backup_required(num_computes); - self.wal_backup_active - } - fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -248,6 +228,7 @@ impl SharedState { &self, ttid: &TenantTimelineId, conf: &SafeKeeperConf, + standby_apply_lsn: Lsn, ) -> SafekeeperTimelineInfo { SafekeeperTimelineInfo { safekeeper_id: conf.my_id.0, @@ -270,13 +251,14 @@ impl SharedState { backup_lsn: self.sk.state.inmem.backup_lsn.0, local_start_lsn: self.sk.state.local_start_lsn.0, availability_zone: conf.availability_zone.clone(), + standby_horizon: standby_apply_lsn.0, } } /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. 
- fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { + pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { let now = Instant::now(); self.peers_info .0 @@ -292,18 +274,13 @@ impl SharedState { /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - fn get_horizon_segno( - &self, - wal_backup_enabled: bool, - extra_horizon_lsn: Option, - ) -> XLogSegNo { + fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { let state = &self.sk.state; use std::cmp::min; let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, state.backup_lsn); - } + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.backup_lsn); if let Some(extra_horizon_lsn) = extra_horizon_lsn { horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } @@ -344,11 +321,6 @@ impl From for ApiError { pub struct Timeline { pub ttid: TenantTimelineId, - /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending ttid instead of concrete command allows to do - /// sending without timeline lock. - pub wal_backup_launcher_tx: Sender, - /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, @@ -360,19 +332,19 @@ pub struct Timeline { term_flush_lsn_watch_tx: watch::Sender, term_flush_lsn_watch_rx: watch::Receiver, + /// Broadcasts shared state updates. + shared_state_version_tx: watch::Sender, + shared_state_version_rx: watch::Receiver, + /// Safekeeper and other state, that should remain consistent and /// synchronized with the disk. This is tokio mutex as we write WAL to disk /// while holding it, ensuring that consensus checks are in order. - mutex: Mutex, + mutex: RwLock, walsenders: Arc, walreceivers: Arc, - /// Cancellation channel. 
Delete/cancel will send `true` here as a cancellation signal. - cancellation_tx: watch::Sender, - - /// Timeline should not be used after cancellation. Background tasks should - /// monitor this channel and stop eventually after receiving `true` from this channel. - cancellation_rx: watch::Receiver, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires + pub(crate) cancel: CancellationToken, /// Directory where timeline state is stored. pub timeline_dir: Utf8PathBuf, @@ -382,15 +354,15 @@ pub struct Timeline { /// with different speed. // TODO: add `Arc` here instead of adding each field separately. walsenders_keep_horizon: bool, + + // timeline_manager controlled state + pub(crate) broker_active: AtomicBool, + pub(crate) wal_backup_active: AtomicBool, } impl Timeline { /// Load existing timeline from disk. - pub fn load_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, - ) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; @@ -400,23 +372,25 @@ impl Timeline { shared_state.sk.get_term(), shared_state.sk.flush_lsn(), ))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(shared_state), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + 
broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -424,7 +398,6 @@ impl Timeline { pub fn create_empty( conf: &SafeKeeperConf, ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -432,25 +405,28 @@ impl Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); + let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -461,8 +437,9 @@ impl Timeline { /// and state on disk should remain unchanged. 
pub async fn init_new( self: &Arc, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, + broker_active_set: Arc, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -493,16 +470,29 @@ impl Timeline { return Err(e); } - self.bootstrap(conf); + self.bootstrap(conf, broker_active_set); Ok(()) } - /// Bootstrap new or existing timeline starting background stasks. - pub fn bootstrap(self: &Arc, conf: &SafeKeeperConf) { + /// Bootstrap new or existing timeline starting background tasks. + pub fn bootstrap( + self: &Arc, + conf: &SafeKeeperConf, + broker_active_set: Arc, + ) { + // Start manager task which will monitor timeline state and update + // background tasks. + tokio::spawn(timeline_manager::main_task( + self.clone(), + conf.clone(), + broker_active_set, + )); + // Start recovery task which always runs on the timeline. if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + // TODO: migrate to timeline_manager if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); } @@ -515,10 +505,9 @@ impl Timeline { /// deletion API endpoint is retriable. pub async fn delete( &self, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, - ) -> Result<(bool, bool)> { - let was_active = shared_state.active; + ) -> Result { self.cancel(shared_state); // TODO: It's better to wait for s3 offloader termination before @@ -532,20 +521,14 @@ impl Timeline { wal_backup::delete_timeline(&self.ttid).await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; - Ok((dir_existed, was_active)) + Ok(dir_existed) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. 
- /// - /// Note that we can't notify backup launcher here while holding - /// shared_state lock, as this is a potential deadlock: caller is - /// responsible for that. Generally we should probably make WAL backup tasks - /// to shut down on their own, checking once in a while whether it is the - /// time. - fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { + fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); - let _ = self.cancellation_tx.send(true); + self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -553,44 +536,16 @@ impl Timeline { /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { - *self.cancellation_rx.borrow() - } - - /// Returns watch channel which gets value when timeline is cancelled. It is - /// guaranteed to have not cancelled value observed (errors otherwise). - pub fn get_cancellation_rx(&self) -> Result> { - let rx = self.cancellation_rx.clone(); - if *rx.borrow() { - bail!(TimelineError::Cancelled(self.ttid)); - } - Ok(rx) + self.cancel.is_cancelled() } /// Take a writing mutual exclusive lock on timeline shared_state. - pub async fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock().await + pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } - async fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state - .update_status(self.walreceivers.get_num(), self.ttid) - .await - } - - /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. 
- pub async fn update_status_notify(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool = { - let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state).await - }; - if is_wal_backup_action_pending { - // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.send(self.ttid).await?; - } - Ok(()) + pub async fn read_shared_state(&self) -> ReadGuardSharedState { + self.mutex.read().await } /// Returns true if walsender should stop sending WAL to pageserver. We @@ -602,7 +557,7 @@ impl Timeline { if self.is_cancelled() { return true; } - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; if self.walreceivers.get_num() == 0 { return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; @@ -610,9 +565,9 @@ impl Timeline { false } - /// Ensure taht current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result> { - let ss = self.write_shared_state().await; + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; if ss.sk.state.acceptor_state.term != t { bail!( "failed to acquire term {}, current term {}", @@ -623,18 +578,6 @@ impl Timeline { Ok(ss) } - /// Returns whether s3 offloading is required and sets current status as - /// matching it. - pub async fn wal_backup_attend(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state() - .await - .wal_backup_attend(self.walreceivers.get_num()) - } - /// Returns commit_lsn watch channel. 
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -645,9 +588,14 @@ impl Timeline { self.term_flush_lsn_watch_rx.clone() } + /// Returns watch channel for SharedState update version. + pub fn get_state_version_rx(&self) -> watch::Receiver { + self.shared_state_version_rx.clone() + } + /// Pass arrived message to the safekeeper. pub async fn process_msg( - &self, + self: &Arc, msg: &ProposerAcceptorMessage, ) -> Result> { if self.is_cancelled() { @@ -655,53 +603,36 @@ impl Timeline { } let mut rmsg: Option; - let commit_lsn: Lsn; - let term_flush_lsn: TermLsn; { let mut shared_state = self.write_shared_state().await; rmsg = shared_state.sk.process_msg(msg).await?; // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby(); + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; } - - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - term_flush_lsn = - TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; - self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().await.get_wal_seg_size() - } - - /// Returns true only if the timeline is loaded and active. - pub async fn is_active(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state().await.active + self.read_shared_state().await.get_wal_seg_size() } /// Returns state of the timeline. pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; (state.sk.state.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. 
pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state.inmem.backup_lsn } /// Sets backup_lsn to the given value. - pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(self: &Arc, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -715,39 +646,34 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state().await; - shared_state.get_safekeeper_info(&self.ttid, conf) + let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + let shared_state = self.read_shared_state().await; + shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. - pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> { - let is_wal_backup_action_pending: bool; - let commit_lsn: Lsn; + pub async fn record_safekeeper_info( + self: &Arc, + sk_info: SafekeeperTimelineInfo, + ) -> Result<()> { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state).await; - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - // Wake up wal backup launcher, if it is time to stop the offloading. - if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } /// Update in memory remote consistent lsn. 
- pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { let mut shared_state = self.write_shared_state().await; shared_state.sk.state.inmem.remote_consistent_lsn = max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); } pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } @@ -769,7 +695,7 @@ impl Timeline { /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). /// Thus we don't try to predict it here. pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.write_shared_state().await; + let ss = self.read_shared_state().await; let term = ss.sk.state.acceptor_state.term; let last_log_term = ss.sk.get_epoch(); let flush_lsn = ss.sk.flush_lsn(); @@ -840,12 +766,12 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.wal_store.flush_lsn() } /// Delete WAL segments from disk that are no longer needed. This is determined /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. 
- pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + pub async fn remove_old_wal(self: &Arc) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -861,9 +787,8 @@ impl Timeline { let horizon_segno: XLogSegNo; let remover = { - let shared_state = self.write_shared_state().await; - horizon_segno = - shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn); + let shared_state = self.read_shared_state().await; + horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); // nothing to do } @@ -885,7 +810,7 @@ impl Timeline { /// passed after the last save. This helps to keep remote_consistent_lsn up /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. - pub async fn maybe_persist_control_file(&self) -> Result<()> { + pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { self.write_shared_state() .await .sk @@ -893,38 +818,33 @@ impl Timeline { .await } - /// Gather timeline data for metrics. If the timeline is not active, returns - /// None, we do not collect these. + /// Gather timeline data for metrics. 
pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); - let state = self.write_shared_state().await; - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - ps_feedback_count, - last_ps_feedback, - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } + let state = self.read_shared_state().await; + Some(FullTimelineInfo { + ttid: self.ttid, + ps_feedback_count, + last_ps_feedback, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + timeline_is_active: self.broker_active.load(Ordering::Relaxed), + num_computes: self.walreceivers.get_num() as u32, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.state.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) } /// Returns in-memory timeline state to build a full debug dump. 
pub async fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -933,8 +853,8 @@ impl Timeline { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), walsenders: self.walsenders.get_all(), - wal_backup_active: state.wal_backup_active, - active: state.active, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, epoch_start_lsn: state.sk.epoch_start_lsn, @@ -948,7 +868,7 @@ impl Timeline { /// Apply a function to the control file state and persist it. pub async fn map_control_file( - &self, + self: &Arc, f: impl FnOnce(&mut TimelinePersistentState) -> Result, ) -> Result { let mut state = self.write_shared_state().await; diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs new file mode 100644 index 0000000000..e74ba37ad8 --- /dev/null +++ b/safekeeper/src/timeline_manager.rs @@ -0,0 +1,145 @@ +//! The timeline manager task is responsible for managing the timeline's background tasks. +//! It is spawned alongside each timeline and exits when the timeline is deleted. +//! It watches for changes in the timeline state and decides when to spawn or kill background tasks. +//! It also can manage some reactive state, like should the timeline be active for broker pushes or not. 
+ +use std::{sync::Arc, time::Duration}; + +use tracing::{info, instrument, warn}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + timelines_set::TimelinesSet, + wal_backup::{self, WalBackupTaskHandle}, + SafeKeeperConf, +}; + +pub struct StateSnapshot { + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, +} + +impl StateSnapshot { + /// Create a new snapshot of the timeline state. + fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + Self { + commit_lsn: read_guard.sk.state.inmem.commit_lsn, + backup_lsn: read_guard.sk.state.inmem.backup_lsn, + remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + peers: read_guard.get_peers(heartbeat_timeout), + } + } +} + +/// Control how often the manager task should wake up to check updates. +/// There is no need to check for updates more often than this. +const REFRESH_INTERVAL: Duration = Duration::from_millis(300); + +/// This task gets spawned alongside each timeline and is responsible for managing the timeline's +/// background tasks. +#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task( + tli: Arc, + conf: SafeKeeperConf, + broker_active_set: Arc, +) { + scopeguard::defer! 
{ + if tli.is_cancelled() { + info!("manager task finished"); + } else { + warn!("manager task finished prematurely"); + } + }; + + // sets whether timeline is active for broker pushes or not + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + + let ttid = tli.ttid; + let wal_seg_size = tli.get_wal_seg_size().await; + let heartbeat_timeout = conf.heartbeat_timeout; + + let mut state_version_rx = tli.get_state_version_rx(); + + let walreceivers = tli.get_walreceivers(); + let mut num_computes_rx = walreceivers.get_num_rx(); + + // list of background tasks + let mut backup_task: Option = None; + + let last_state = 'outer: loop { + MANAGER_ITERATIONS_TOTAL.inc(); + + let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); + let num_computes = *num_computes_rx.borrow(); + + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task( + &conf, + ttid, + is_wal_backup_required, + &state_snapshot, + &mut backup_task, + ) + .await; + } + + let is_active = is_wal_backup_required + || num_computes > 0 + || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + + if !is_active { + // TODO: maybe use tokio::spawn? + if let Err(e) = tli.maybe_persist_control_file().await { + warn!("control file save in update_status failed: {:?}", e); + } + } + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + + // wait until something changes. 
tx channels are stored under Arc, so they will not be + // dropped until the manager task is finished. + tokio::select! { + _ = tli.cancel.cancelled() => { + // timeline was deleted + break 'outer state_snapshot; + } + _ = async { + // don't wake up on every state change, but at most every REFRESH_INTERVAL + tokio::time::sleep(REFRESH_INTERVAL).await; + let _ = state_version_rx.changed().await; + } => { + // state was updated + } + _ = num_computes_rx.changed() => { + // number of connected computes was updated + } + } + }; + + // shutdown background tasks + if conf.is_wal_backup_enabled() { + wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + } +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 079e706ff8..8d37bd6371 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,6 +4,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{Timeline, TimelineError}; +use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -11,16 +12,16 @@ use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; +use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; -use tokio::sync::mpsc::Sender; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, - wal_backup_launcher_tx: Option>, conf: Option, + broker_active_set: Arc, load_lock: Arc>, } @@ -36,11 +37,8 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { - ( - self.get_conf().clone(), - self.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { + (self.get_conf().clone(), self.broker_active_set.clone()) } /// Insert timeline into the map. 
Returns error if timeline with the same id already exists. @@ -65,8 +63,8 @@ impl GlobalTimelinesState { static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - wal_backup_launcher_tx: None, conf: None, + broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -76,16 +74,11 @@ pub struct GlobalTimelines; impl GlobalTimelines { /// Inject dependencies needed for the timeline constructors and load all timelines to memory. - pub async fn init( - conf: SafeKeeperConf, - wal_backup_launcher_tx: Sender, - ) -> Result<()> { + pub async fn init(conf: SafeKeeperConf) -> Result<()> { // clippy isn't smart enough to understand that drop(state) releases the // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -129,12 +122,9 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. 
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); - ( - state.get_conf().clone(), - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + state.get_dependencies() }; let timelines_dir = conf.tenant_dir(&tenant_id); @@ -147,7 +137,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); TIMELINES_STATE @@ -155,8 +145,7 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); - tli.update_status_notify().await.unwrap(); + tli.bootstrap(&conf, broker_active_set.clone()); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -189,9 +178,9 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); @@ -202,7 +191,7 @@ impl GlobalTimelines { .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); + tli.bootstrap(&conf, broker_active_set); Ok(tli) } @@ -221,6 +210,10 @@ impl GlobalTimelines { TIMELINES_STATE.lock().unwrap().get_conf().clone() } + pub fn get_global_broker_active_set() -> Arc { + TIMELINES_STATE.lock().unwrap().broker_active_set.clone() + } + /// Create a new timeline with the given id. 
If the timeline already exists, returns /// an existing timeline. pub async fn create( @@ -229,7 +222,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -243,7 +236,6 @@ impl GlobalTimelines { let timeline = Arc::new(Timeline::create_empty( &conf, ttid, - wal_backup_launcher_tx, server_info, commit_lsn, local_start_lsn, @@ -264,7 +256,10 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.init_new(&mut shared_state, &conf).await { + if let Err(e) = timeline + .init_new(&mut shared_state, &conf, broker_active_set) + .await + { // Note: the most likely reason for init failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -281,8 +276,6 @@ impl GlobalTimelines { // We are done with bootstrap, release the lock, return the timeline. // {} block forces release before .await } - timeline.update_status_notify().await?; - timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; Ok(timeline) } @@ -335,12 +328,13 @@ impl GlobalTimelines { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { + let was_active = timeline.broker_active.load(Ordering::Relaxed); + // Take a lock and finish the deletion holding this mutex. 
let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}, only_local={}", ttid, only_local); - let (dir_existed, was_active) = - timeline.delete(&mut shared_state, only_local).await?; + let dir_existed = timeline.delete(&mut shared_state, only_local).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -349,7 +343,7 @@ impl GlobalTimelines { Ok(TimelineDeleteForceResult { dir_existed, - was_active, + was_active, // TODO: we probably should remove this field }) } Err(_) => { diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs new file mode 100644 index 0000000000..ea8e23bb72 --- /dev/null +++ b/safekeeper/src/timelines_set.rs @@ -0,0 +1,90 @@ +use std::{collections::HashMap, sync::Arc}; + +use utils::id::TenantTimelineId; + +use crate::timeline::Timeline; + +/// Set of timelines, supports operations: +/// - add timeline +/// - remove timeline +/// - clone the set +/// +/// Usually used for keeping subset of timelines. For example active timelines that require broker push. +pub struct TimelinesSet { + timelines: std::sync::Mutex>>, +} + +impl Default for TimelinesSet { + fn default() -> Self { + Self { + timelines: std::sync::Mutex::new(HashMap::new()), + } + } +} + +impl TimelinesSet { + pub fn insert(&self, tli: Arc) { + self.timelines.lock().unwrap().insert(tli.ttid, tli); + } + + pub fn delete(&self, ttid: &TenantTimelineId) { + self.timelines.lock().unwrap().remove(ttid); + } + + /// If present is true, adds timeline to the set, otherwise removes it. + pub fn set_present(&self, tli: Arc, present: bool) { + if present { + self.insert(tli); + } else { + self.delete(&tli.ttid); + } + } + + pub fn is_present(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.lock().unwrap().contains_key(ttid) + } + + /// Returns all timelines in the set. 
+ pub fn get_all(&self) -> Vec> { + self.timelines.lock().unwrap().values().cloned().collect() + } + + /// Returns a timeline guard for easy presence control. + pub fn guard(self: &Arc, tli: Arc) -> TimelineSetGuard { + let is_present = self.is_present(&tli.ttid); + TimelineSetGuard { + timelines_set: self.clone(), + tli, + is_present, + } + } +} + +/// Guard is used to add or remove timeline from the set. +/// If the timeline present in set, it will be removed from it on drop. +/// Note: do not use more than one guard for the same timeline, it caches the presence state. +/// It is designed to be used in the manager task only. +pub struct TimelineSetGuard { + timelines_set: Arc, + tli: Arc, + is_present: bool, +} + +impl TimelineSetGuard { + /// Returns true if the state was changed. + pub fn set(&mut self, present: bool) -> bool { + if present == self.is_present { + return false; + } + self.is_present = present; + self.timelines_set.set_present(self.tli.clone(), present); + true + } +} + +impl Drop for TimelineSetGuard { + fn drop(&mut self) { + // remove timeline from the map on drop + self.timelines_set.delete(&self.tli.ttid); + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e496f07114..84680557f9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -9,7 +9,7 @@ use utils::backoff; use utils::id::NodeId; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; @@ -29,9 +29,10 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::{PeerInfo, Timeline}; -use crate::{GlobalTimelines, SafeKeeperConf}; +use crate::timeline_manager::StateSnapshot; +use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; 
@@ -41,35 +42,84 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; /// Default buffer size when interfacing with [`tokio::fs::File`]. const BUFFER_SIZE: usize = 32 * 1024; -/// Check whether wal backup is required for timeline. If yes, mark that launcher is -/// aware of current status and return the timeline. -async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - match GlobalTimelines::get(ttid).ok() { - Some(tli) => { - tli.wal_backup_attend().await; - Some(tli) - } - None => None, - } -} - -struct WalBackupTaskHandle { +pub struct WalBackupTaskHandle { shutdown_tx: Sender<()>, handle: JoinHandle<()>, } -struct WalBackupTimelineEntry { - timeline: Arc, - handle: Option, +/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? +pub fn is_wal_backup_required( + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, +) -> bool { + num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size)) } -async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { - if let Some(wb_handle) = entry.handle.take() { +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. 
+pub async fn update_task( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + need_backup: bool, + state: &StateSnapshot, + entry: &mut Option, +) { + let (offloader, election_dbg_str) = + determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + let should_task_run = need_backup && elected_me; + + // start or stop the task + if should_task_run != (entry.is_some()) { + if should_task_run { + info!("elected for backup: {}", election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&ttid); + + let async_task = backup_task_main( + ttid, + timeline_dir, + conf.workdir.clone(), + conf.backup_parallel_jobs, + shutdown_rx, + ); + + let handle = if conf.current_thread_runtime { + tokio::spawn(async_task) + } else { + WAL_BACKUP_RUNTIME.spawn(async_task) + }; + + *entry = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + if !need_backup { + // don't need backup at all + info!("stepping down from backup, need_backup={}", need_backup); + } else { + // someone else has been elected + info!("stepping down from backup: {}", election_dbg_str); + } + shut_down_task(entry).await; + } + } +} + +async fn shut_down_task(entry: &mut Option) { + if let Some(wb_handle) = entry.take() { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); + warn!("WAL backup task panicked: {}", e); } } } @@ -126,49 +176,6 @@ fn determine_offloader( } } -/// Based on peer information determine which safekeeper should offload; if it -/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task -/// is running, kill it. 
-async fn update_task( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - entry: &mut WalBackupTimelineEntry, -) { - let alive_peers = entry.timeline.get_peers(conf).await; - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; - let (offloader, election_dbg_str) = - determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); - let elected_me = Some(conf.my_id) == offloader; - - if elected_me != (entry.handle.is_some()) { - if elected_me { - info!("elected for backup: {}", election_dbg_str); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ) - .in_current_span(), - ); - - entry.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); - } else { - info!("stepping down from backup: {}", election_dbg_str); - shut_down_task(ttid, entry).await; - } - } -} - static REMOTE_STORAGE: OnceCell> = OnceCell::new(); // Storage must be configured and initialized when this is called. @@ -190,67 +197,6 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { }); } -const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; - -/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup -/// tasks. Having this in separate task simplifies locking, allows to reap -/// panics and separate elections from offloading itself. -pub async fn wal_backup_launcher_task_main( - conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, -) -> anyhow::Result<()> { - info!( - "WAL backup launcher started, remote config {:?}", - conf.remote_storage - ); - - // Presence in this map means launcher is aware s3 offloading is needed for - // the timeline, but task is started only if it makes sense for to offload - // from this safekeeper. 
- let mut tasks: HashMap = HashMap::new(); - - let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); - loop { - tokio::select! { - ttid = wal_backup_launcher_rx.recv() => { - // channel is never expected to get closed - let ttid = ttid.unwrap(); - if !conf.is_wal_backup_enabled() { - continue; /* just drain the channel and do nothing */ - } - async { - let timeline = is_wal_backup_required(ttid).await; - // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&ttid) { - if let Some(timeline) = timeline { - // need to start the task - let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { - timeline, - handle: None, - }); - update_task(&conf, ttid, entry).await; - } else { - // need to stop the task - info!("stopping WAL backup task"); - let mut entry = tasks.remove(&ttid).unwrap(); - shut_down_task(ttid, &mut entry).await; - } - } - }.instrument(info_span!("WAL backup", ttid = %ttid)).await; - } - // For each timeline needing offloading, check if this safekeeper - // should do the job and start/stop the task accordingly. - _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut() { - update_task(&conf, *ttid, entry) - .instrument(info_span!("WAL backup", ttid = %ttid)) - .await; - } - } - } - } -} - struct WalBackupTask { timeline: Arc, timeline_dir: Utf8PathBuf, @@ -261,6 +207,7 @@ struct WalBackupTask { } /// Offload single timeline. 
+#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: Utf8PathBuf, @@ -268,6 +215,8 @@ async fn backup_task_main( parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { + let _guard = WAL_BACKUP_TASKS.guard(); + info!("started"); let res = GlobalTimelines::get(ttid); if let Err(e) = res { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 200096ac5c..29e944bff3 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; - // sleep for random time to avoid thundering herd { let randf64 = rand::thread_rng().gen_range(0.0..1.0); @@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { && flush_lsn_rx.borrow().term == seg.term { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // if we don't have any data and zero LSNs, wait for something while flush_lsn_rx.borrow().lsn == Lsn(0) { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // waiting until timeout expires OR segno changes 'inner: loop { tokio::select! 
{ - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh new file mode 100755 index 0000000000..87e52c1e64 --- /dev/null +++ b/scripts/check_allowed_errors.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -eu + +HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")" +SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py" + +# first run to understand all of the errors: +# +# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log +# example: ./scripts/check_allowed_errors.sh -i pageserver.log +# +# then edit the test local allowed_errors to the +# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure +# they are handled. +# +# finally revert any local changes to allowed_errors.py. +poetry run python3 "$HELPER_DIR/../$SCRIPT" $* diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 878840fcee..919a9278a9 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -5,10 +5,11 @@ import json import logging import os from collections import defaultdict -from typing import DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Optional import psycopg2 import psycopg2.extras +import toml FLAKY_TESTS_QUERY = """ SELECT @@ -58,6 +59,24 @@ def main(args: argparse.Namespace): else: pageserver_virtual_file_io_engine_parameter = "" + # re-use existing records of flaky tests from before parametrization by compaction_algorithm + def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + """Duplicated from parametrize.py""" + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + pageserver_default_tenant_config_compaction_algorithm_parameter = "" + if ( + explicit_default := 
get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + pageserver_default_tenant_config_compaction_algorithm_parameter = ( + f"-{explicit_default['kind']}" + ) + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -66,10 +85,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index d66cbefa45..1a6fb7fedf 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -147,6 +147,7 @@ async fn publish(client: Option, n_keys: u64) { http_connstr: "zenith-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 7d1b63d23f..a420fd9c66 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -42,6 +42,7 @@ message SafekeeperTimelineInfo { uint64 remote_consistent_lsn = 7; uint64 peer_horizon_lsn = 8; uint64 local_start_lsn = 9; + uint64 standby_horizon = 14; // A connection string to use for WAL receiving. 
string safekeeper_connstr = 10; // HTTP endpoint connection string @@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse { string safekeeper_connstr = 4; // Availability zone of a safekeeper. optional string availability_zone = 5; + // Replica apply LSN + uint64 standby_horizon = 6; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 8c88b61abc..0a4af543ab 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -736,6 +736,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }) } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1454af533..ce8f8d0cdd 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -66,6 +66,10 @@ struct Cli { #[arg(long)] max_unavailable_interval: Option, + /// Size threshold for automatically splitting shards (disabled by default) + #[arg(long)] + split_threshold: Option, + /// Maximum number of reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, @@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + split_threshold: args.split_threshold, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 25b6b67e12..769aba80ca 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,7 +2,7 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineInfo, 
TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -234,4 +234,16 @@ impl PageserverClient { self.inner.get_utilization().await ) } + + pub(crate) async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + measured_request!( + "top_tenants", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.top_tenant_shards(request).await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index dca37166ba..67c05296d5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -173,7 +173,7 @@ impl Persistence { /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY @@ -199,13 +199,48 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. 
+ const MAX_RETRIES: usize = 128; + let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { func(&mut conn) }) - .await - .expect("Task panic") + tokio::task::spawn_blocking(move || -> DatabaseResult { + let mut retry_count = 0; + loop { + match conn.build_transaction().serializable().run(|c| func(c)) { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } + } + Err(e) => break Err(e), + } + } + }) + .await + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything @@ -358,14 +393,11 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } - Ok(()) - })?; + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } Ok(()) }, ) @@ -533,8 +565,11 @@ impl Persistence { let update = ShardUpdate { generation: input_generation.map(|g| g.into().unwrap() as i32), placement_policy: input_placement_policy + .as_ref() .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), 
scheduling_policy: input_scheduling_policy .map(|p| serde_json::to_string(&p).unwrap()), }; @@ -581,55 +616,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> DatabaseResult<()> { - // Mark parent shards as splitting + // Mark parent shards as splitting - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .set((splitting.eq(1),)) - .execute(conn)?; - if u8::try_from(updated) - .map_err(|_| DatabaseError::Logical( - format!("Overflow existing shard count {} while splitting", updated)) - )? != old_shard_count.count() { - // Perhaps a deletion or another split raced with this attempt to split, mutating - // the parent shards that we intend to split. In this case the split request should fail. - return Err(DatabaseError::Logical( - format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) - )); + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. 
+ return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; } - - // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); - - // Insert child shards - for (parent_shard_id, children) in parent_to_children { - let mut parent = crate::schema::tenant_shards::table - .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; - let parent = if parent.len() != 1 { - return Err(DatabaseError::Logical(format!( - "Parent shard {parent_shard_id} not found" - ))); - } else { - parent.pop().unwrap() - }; - for mut shard in children { - // Carry the parent's generation into the child - shard.generation = parent.generation; - - debug_assert!(shard.splitting == SplitState::Splitting); - diesel::insert_into(tenant_shards) - .values(shard) - .execute(conn)?; - } - } - - Ok(()) 
- })?; + } Ok(()) }) @@ -647,22 +678,18 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::CompleteShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); - - Ok(()) - })?; + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); Ok(()) }, @@ -681,39 +708,34 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::AbortShardSplit, move |conn| -> DatabaseResult { - let aborted = - conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. 
+ if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; - - Ok(aborted) + Ok(AbortShardSplitStatus::Aborted) }, ) .await diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d3a53066c9..f914f4e0bb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use pageserver_api::{ TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, - models::{SecondaryProgress, TenantConfigRequest}, + models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; use reqwest::StatusCode; -use tracing::instrument; +use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -222,6 +222,10 @@ pub struct Config { /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + + /// How large must a shard grow 
in bytes before we split it? + /// None disables auto-splitting. + pub split_threshold: Option, } impl From for ApiError { @@ -699,7 +703,7 @@ impl Service { /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible /// for those retries. #[instrument(skip_all)] - async fn background_reconcile(&self) { + async fn background_reconcile(self: &Arc) { self.startup_complete.clone().wait().await; const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); @@ -711,7 +715,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all().await; + let optimizations = self.optimize_all().await; + if optimizations == 0 { + // Run new splits only when no optimizations are pending + self.autosplit_tenants().await; + } } } _ = self.cancel.cancelled() => return @@ -4745,7 +4753,7 @@ impl Service { // them in an optimization const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; - if progress.bytes_total == 0 + if progress.heatmap_mtime.is_none() || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD && progress.bytes_downloaded != progress.bytes_total || progress.bytes_total - progress.bytes_downloaded @@ -4766,6 +4774,104 @@ impl Service { validated_work } + /// Look for shards which are oversized and in need of splitting + async fn autosplit_tenants(self: &Arc) { + let Some(split_threshold) = self.config.split_threshold else { + // Auto-splitting is disabled + return; + }; + + let nodes = self.inner.read().unwrap().nodes.clone(); + + const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); + + let mut top_n = Vec::new(); + + // Call into each node to look for big tenants + let top_n_request = TopTenantShardsRequest { + // We currently split based on logical size, for simplicity: logical size is a signal of + // the user's intent to run a large database, whereas physical/resident size can be symptoms + // of 
compaction issues. Eventually we should switch to using resident size to bound the + // disk space impact of one shard. + order_by: models::TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(SPLIT_TO_MAX), + where_gt: Some(split_threshold), + }; + for node in nodes.values() { + let request_ref = &top_n_request; + match node + .with_client_retries( + |client| async move { + let request = request_ref.clone(); + client.top_tenant_shards(request.clone()).await + }, + &self.config.jwt_token, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(node_top_n)) => { + top_n.extend(node_top_n.shards.into_iter()); + } + Some(Err(mgmt_api::Error::Cancelled)) => { + continue; + } + Some(Err(e)) => { + tracing::warn!("Failed to fetch top N tenants from {node}: {e}"); + continue; + } + None => { + // Node is shutting down + continue; + } + }; + } + + // Pick the biggest tenant to split first + top_n.sort_by_key(|i| i.resident_size); + let Some(split_candidate) = top_n.into_iter().next() else { + tracing::debug!("No split-elegible shards found"); + return; + }; + + // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't + // want to block the background reconcile loop on this. + tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + + let this = self.clone(); + tokio::spawn( + async move { + match this + .tenant_shard_split( + split_candidate.id.tenant_id, + TenantShardSplitRequest { + // Always split to the max number of shards: this avoids stepping through + // intervening shard counts and encountering the overrhead of a split+cleanup + // each time as a tenant grows, and is not too expensive because our max shard + // count is relatively low anyway. + // This policy will be adjusted in future once we support higher shard count. 
+ new_shard_count: SPLIT_TO_MAX.literal(), + new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + }, + ) + .await + { + Ok(_) => { + tracing::info!("Successful auto-split"); + } + Err(e) => { + tracing::error!("Auto-split failed: {e}"); + } + } + } + .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + ); + } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. diff --git a/test_runner/README.md b/test_runner/README.md index 051897744a..fd68cfff79 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -92,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. +At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. 
+ LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with the same name but a different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. +That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option<String>, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option<String>, + ... +} +``` + +*Credentials* are not part of the config, but discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanisms here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. +Our CI uses this mechanism. 
+However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=... \ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run tests without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `mybucket` using the CLI.
+ +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensitive to it. + +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. + +So, the above instructions apply to the Rust tests as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index c32748f6f0..038f557cc8 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from _pytest.terminal import TerminalReporter +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver -from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks.
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py similarity index 100% rename from test_runner/fixtures/types.py rename to test_runner/fixtures/common_types.py diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index a883d94f73..66fc35b6aa 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -5,8 +5,8 @@ import pytest from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +from fixtures.common_types import TenantId from fixtures.log_helper import log -from fixtures.types import TenantId class ComputeReconfigure: diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py new file mode 100644 index 0000000000..42f0539c19 --- /dev/null +++ b/test_runner/fixtures/endpoint/http.py @@ -0,0 +1,23 @@ +import requests +from requests.adapters import HTTPAdapter + + +class EndpointHttpClient(requests.Session): + def __init__( + self, + port: int, + ): + super().__init__() + self.port = port + + self.mount("http://", HTTPAdapter()) + + def dbs_and_roles(self): + res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res.raise_for_status() + return res.json() + + def database_schema(self, database: str): + res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res.raise_for_status() + return res.text diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7d34e12ca3..8b8075f8c1 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", @@ -149,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", + "pageserver_aux_file_estimated_size", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 30cec4c726..796ae7217b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,7 @@ import textwrap import threading import time import uuid -from contextlib import ExitStack, closing, contextmanager +from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum @@ -47,17 +47,20 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) +from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, 
wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -72,13 +75,13 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, allure_attach_from_dir, assert_no_errors, get_self_dir, + print_gc_result, subprocess_capture, wait_until, ) @@ -452,7 +455,7 @@ class NeonEnvBuilder: test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str] = None, + pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -467,6 +470,7 @@ class NeonEnvBuilder: initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -507,6 +511,14 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_default_tenant_config_compaction_algorithm: Optional[ + Dict[str, Any] + ] = pageserver_default_tenant_config_compaction_algorithm + if self.pageserver_default_tenant_config_compaction_algorithm is not None: + log.debug( + f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" + ) + self.pageserver_get_vectored_impl: Optional[str] = None if 
os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": self.pageserver_get_vectored_impl = "vectored" @@ -701,6 +713,10 @@ class NeonEnvBuilder: config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) @@ -1054,14 +1070,14 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - # Create a config file corresponding to the options + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: @@ -1099,6 +1115,26 @@ class NeonEnv: ps_cfg["get_impl"] = config.pageserver_get_impl if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if config.pageserver_default_tenant_config_compaction_algorithm is not None: + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config[ + "compaction_algorithm" + ] = config.pageserver_default_tenant_config_compaction_algorithm + + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + if callable(config.pageserver_config_override): + config.pageserver_config_override(ps_cfg) + else: + assert isinstance(config.pageserver_config_override, str) + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in 
override.items(): + ps_cfg[key] = value # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1136,7 +1172,6 @@ class NeonEnv: self.neon_cli.init( cfg, force=config.config_init_force, - pageserver_config_override=config.pageserver_config_override, ) def start(self): @@ -1290,6 +1325,7 @@ def _shared_simple_env( pg_version: PgVersion, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1321,6 +1357,7 @@ def _shared_simple_env( test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: env = builder.init_start() @@ -1360,7 +1397,8 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, - pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1395,6 +1433,7 @@ def neon_env_builder( test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder @@ -1586,7 +1625,7 @@ class NeonCli(AbstractNeonCli): args.extend(["-c", "switch_aux_file_policy:v1"]) if aux_file_v2 is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + args.extend(["-c", "switch_aux_file_policy:cross-validation"]) if set_default: args.append("--set-default") @@ -1722,46 +1761,22 @@ class NeonCli(AbstractNeonCli): def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], force: Optional[str] = None, - pageserver_config_override: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - remote_storage = self.env.pageserver_remote_storage - - ps_config = {} - if remote_storage is not None: - ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage) - - if pageserver_config_override is not None: - for o in pageserver_config_override.split(";"): - override = toml.loads(o) - for key, value in override.items(): - ps_config[key] = value - - with ExitStack() as stack: - ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - ps_config_file.write(toml.dumps(ps_config)) - ps_config_file.flush() - - neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - neon_local_config.write(toml.dumps(config)) - neon_local_config.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() cmd = [ "init", - f"--config={neon_local_config.name}", - "--pg-version", - self.env.pg_version, - f"--pageserver-config={ps_config_file.name}", + f"--config={init_config_tmpfile.name}", ] if force is not None: cmd.extend(["--force", force]) - s3_env_vars 
= None - if isinstance(remote_storage, S3Storage): - s3_env_vars = remote_storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() return res @@ -2678,7 +2693,7 @@ class NeonPageserver(PgProtocol, LogUtils): ) def layer_exists( - self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName ) -> bool: layers = self.list_layers(tenant_id, timeline_id) return layer_name in [parse_layer_file_name(p.name) for p in layers] @@ -2706,7 +2721,12 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ): """ Run one of the postgres binaries. @@ -2768,6 +2788,28 @@ class PgBin: log.info(f"last checkpoint at {checkpoint_lsn}") return Lsn(checkpoint_lsn) + def take_fullbackup( + self, + pageserver: NeonPageserver, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, + output: Path, + ): + """ + Request fullbackup from pageserver, store it at 'output'. 
+ """ + cmd = [ + "psql", + "--no-psqlrc", + pageserver.connstr(), + "-c", + f"fullbackup {tenant} {timeline} {lsn}", + "-o", + str(output), + ] + self.run_capture(cmd) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: @@ -3363,6 +3405,13 @@ class Endpoint(PgProtocol): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> EndpointHttpClient: + return EndpointHttpClient( + port=self.http_port, + ) + def create( self, branch_name: str, @@ -4123,7 +4172,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + endpoint: Endpoint, + ignored_files: Optional[list[str]] = None, +): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) # Get the timeline ID. 
We need it for the 'basebackup' command @@ -4176,6 +4230,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint if not f.startswith("pg_xact") and not f.startswith("pg_multixact") ] + if ignored_files: + pgdata_files = [f for f in pgdata_files if f not in ignored_files] + restored_files = [f for f in restored_files if f not in ignored_files] + # check that file sets are equal assert pgdata_files == restored_files @@ -4266,6 +4324,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): time.sleep(1) +def log_replica_lag(primary: Endpoint, secondary: Endpoint): + last_replay_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + lag = primary_lsn - last_replay_lsn + log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}") + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, @@ -4411,3 +4480,79 @@ def parse_project_git_version_output(s: str) -> str: return commit raise ValueError(f"unable to parse --version output: '{s}'") + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, + pageserver: NeonPageserver, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. 
+ """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 200) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. 
+ final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index e560844944..fa6e4eaafd 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", + ".*Compaction failed.*, retrying in .*: timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", @@ -88,9 +89,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. 
- ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", + ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", + # Can happen during shutdown + ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Can happen during shutdown + ".*ignoring failure to find gc cutoffs: timeline shutting down.*", ) @@ -131,9 +136,10 @@ if __name__ == "__main__": "-i", "--input", type=argparse.FileType("r"), - default=sys.stdin, - help="Pageserver logs file. Reads from stdin if no file is provided.", + help="Pageserver logs file. Use '-' for stdin.", + required=True, ) + args = parser.parse_args() errors = _check_allowed_errors(args.input) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py similarity index 79% rename from test_runner/fixtures/pageserver/types.py rename to test_runner/fixtures/pageserver/common_types.py index fd018cb778..a6c327a8a0 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union -from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn +from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @dataclass @@ -12,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -26,7 +26,7 @@ class ImageLayerFileName: @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key @@ -41,14 +41,16 @@ class DeltaLayerFileName: return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, 
DeltaLayerName] class InvalidFileName(Exception): pass -IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$") +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def parse_image_layer(f_name: str) -> Tuple[int, int, int]: @@ -62,7 +64,7 @@ def parse_image_layer(f_name: str) -> Tuple[int, int, int]: DELTA_LAYER_FILE_NAME = re.compile( - "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$" + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" ) @@ -80,16 +82,16 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: ) -def parse_layer_file_name(file_name: str) -> LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -101,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization from the given disk_consistent_lsn. 
""" - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -122,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b06972056c..f1f96f6d5f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -11,10 +11,10 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn @@ -56,20 +56,30 @@ class InMemoryLayerInfo: class HistoricLayerInfo: kind: str layer_file_name: str - layer_file_size: Optional[int] + layer_file_size: int lsn_start: str lsn_end: Optional[str] remote: bool + # None for image layers, true if pageserver thinks this is an L0 delta layer + l0: Optional[bool] @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + # instead of parsing the key range lets keep the definition of "L0" in pageserver + l0_ness = d.get("l0") + assert l0_ness is None or isinstance(l0_ness, bool) + + size = d["layer_file_size"] + assert isinstance(size, int) + return HistoricLayerInfo( kind=d["kind"], 
layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), + layer_file_size=size, lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + l0=l0_ness, ) @@ -583,6 +593,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -590,6 +601,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -656,6 +669,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -663,6 +677,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -890,3 +906,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert current_logical_size == non_incremental assert isinstance(current_logical_size, int) return current_logical_size + + def top_tenants( + self, order_by: str, limit: int, where_shards_lt: int, where_gt: int + ) -> dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/top_tenants", + json={ + "order_by": order_by, + "limit": limit, + "where_shards_lt": where_shards_lt, + "where_gt": where_gt, + }, + ) + self.verbose_error(res) + return res.json() # type: ignore diff --git 
a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index f47a3ea043..def80a1c3e 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -3,6 +3,7 @@ import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -12,7 +13,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId, TimelineId def single_timeline( diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py index e6cd9b4614..0c3612716a 100644 --- a/test_runner/fixtures/pageserver/remote_storage.py +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -6,13 +6,13 @@ import threading from pathlib import Path from typing import Any, List, Tuple +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import NeonEnv, Pagectl -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( InvalidFileName, parse_layer_file_name, ) from fixtures.remote_storage import LocalFsStorage -from fixtures.types import TenantId, TimelineId def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4b0dd7a815..91435e8a1f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, 
PageserverHttpClient from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 77523a542b..0227285822 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,7 +1,8 @@ import os -from typing import Optional +from typing import Any, Dict, Optional import pytest +import toml from _pytest.python import Metafunc from fixtures.pg_version import PgVersion @@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None +def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + return get_pageserver_default_tenant_config_compaction_algorithm() + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] @@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc): ): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + # Same hack for pageserver_default_tenant_config_compaction_algorithm + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + metafunc.parametrize( + "pageserver_default_tenant_config_compaction_algorithm", + [explicit_default], + ids=[explicit_default["kind"]], + ) + # For performance tests, parametrize also by platform if ( "test_runner/performance" in metafunc.definition._nodeid diff --git a/test_runner/fixtures/remote_storage.py 
b/test_runner/fixtures/remote_storage.py index 925e1b450f..ee18c53b52 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,8 @@ import boto3 import toml from mypy_boto3_s3 import S3Client +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -50,7 +50,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index b9c1986818..82148d0556 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import Lsn, TenantId, TimelineId # Walreceiver as returned by sk's timeline status endpoint. 
diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 2818a493d6..0e4b5d7883 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.types import TenantId, TimelineId def are_walreceivers_absent( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 6470621900..22bb43c580 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,10 +4,13 @@ import json import os import re import subprocess +import tarfile import threading import time +from hashlib import sha256 from pathlib import Path from typing import ( + IO, TYPE_CHECKING, Any, Callable, @@ -15,8 +18,10 @@ from typing import ( Iterable, List, Optional, + Set, Tuple, TypeVar, + Union, ) from urllib.parse import urlencode @@ -25,14 +30,14 @@ import zstandard from psycopg2.extensions import cursor from fixtures.log_helper import log -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) if TYPE_CHECKING: from fixtures.neon_fixtures import PgBin -from fixtures.types import TimelineId +from fixtures.common_types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -452,6 +457,7 @@ def humantime_to_ms(humantime: str) -> float: def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py error_or_warn = re.compile(r"\s(ERROR|WARN)") errors = [] for lineno, line in enumerate(input, start=1): @@ -484,17 +490,62 @@ def assert_no_errors(log_file, service, allowed_errors): for _lineno, error in errors: log.info(f"not allowed {service} error: {error.strip()}") - assert not errors, f"Log errors on 
{service}: {errors[0]}" + assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" @enum.unique class AuxFileStore(str, enum.Enum): - V1 = "V1" - V2 = "V2" - CrossValidation = "CrossValidation" + V1 = "v1" + V2 = "v2" + CrossValidation = "cross-validation" def __repr__(self) -> str: return f"'aux-{self.value}'" def __str__(self) -> str: return f"'aux-{self.value}'" + + +def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. + """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + try: + assert len(left_list) == len(right_list) + + for left_tuple, right_tuple in zip(left_list, right_list): + assert left_tuple == right_tuple + finally: + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index c44628ce06..dfd9caba3e 100644 --- a/test_runner/fixtures/workload.py +++ 
b/test_runner/fixtures/workload.py @@ -1,6 +1,7 @@ import threading from typing import Any, Optional +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import TenantId, TimelineId # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py new file mode 100644 index 0000000000..644c1f559b --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -0,0 +1,175 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.remote_storage import s3_storage +from fixtures.utils import humantime_to_ms + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"]) +@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100]) +@pytest.mark.timeout(1000) +def test_download_churn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + io_engine: str, + concurrency_per_target: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # 
params from fixtures + params.update( + { + # we don't capture `duration`, but instead use the `runtime` output field from pagebench + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + # Setup env + env = setup_env(neon_env_builder, pg_bin) + env.pageserver.allowed_errors.append( + f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + + +def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # We configure tenant conf such that SQL query below produces a lot of layers. + # We don't care what's in the layers really, we just care that layers are created. 
+ bytes_per_layer = 10 * (1024**2) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "pitr_interval": "1000d", # let's not make it get in the way + "gc_period": "0s", # disable periodic gc to avoid noise + "compaction_period": "0s", # disable L0=>L1 compaction + "checkpoint_timeout": "10years", # rely solely on checkpoint_distance + "checkpoint_distance": bytes_per_layer, # 10M instead of 256M to create more smaller layers + "image_creation_threshold": 100000, # don't create image layers ever + } + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + ep.safe_psql("CREATE TABLE data (random_text text)") + bytes_per_row = 512 # make big enough so WAL record size doesn't dominate + desired_layers = 300 + desired_bytes = bytes_per_layer * desired_layers + nrows = desired_bytes / bytes_per_row + ep.safe_psql( + f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", + options="-c statement_timeout=0", + ) + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + + return env + + +def run_benchmark( + env: NeonEnv, + pg_bin: PgBin, + record, + io_engine: str, + concurrency_per_target: int, + duration_secs: int, +): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "ondemand-download-churn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--runtime", + f"{duration_secs}s", + "--set-io-engine", + f"{io_engine}", + "--concurrency-per-target", + f"{concurrency_per_target}", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + 
results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + metric = "downloads_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "downloads_bytes" + record( + metric, + metric_value=results[metric], + unit="byte", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "evictions_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "timeline_restarts" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "runtime" + record( + metric, + metric_value=humantime_to_ms(results[metric]) / 1000, + unit="s", + report=MetricReport.TEST_PARAM, + ) diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 009d62c9ba..f31cd9a9f8 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -5,13 +5,13 @@ Utilities used by all code in this sub-directory from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.many_tenants as many_tenants +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.utils import wait_until_all_tenants_state -from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 7687b8417f..b3866f1813 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -9,11 +9,11 @@ from typing import List import pytest 
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 1df3f2f5f1..3f56da7c1d 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -2,10 +2,10 @@ from contextlib import closing import pytest from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 from fixtures.pg_version import PgVersion -from fixtures.types import Lsn # diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py new file mode 100644 index 0000000000..9cd83f0959 --- /dev/null +++ b/test_runner/performance/test_sharding_autosplit.py @@ -0,0 +1,280 @@ +import concurrent.futures +import re +from pathlib import Path + +import pytest +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + tenant_get_shards, +) + + +@pytest.mark.timeout(600) +def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Check that sharding, including auto-splitting, "just works" under pgbench workloads. + + This is not a benchmark, but it lives in the same place as benchmarks in order to be run + on a dedicated node that can sustain some significant throughput. + + Other tests validate the details of shard splitting, error cases etc. 
This test is + the sanity check that it all really works as expected with realistic amounts of data + and under load. + + Success conditions: + - Tenants auto-split when their capacity grows + - Client workloads are not interrupted while that happens + """ + + neon_env_builder.num_pageservers = 8 + neon_env_builder.storage_controller_config = { + # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical + # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes + # will reliably be greater. + "split_threshold": 1024 * 1024 * 500 + } + + tenant_conf = { + # We want layer rewrites to happen as soon as possible (this is the most stressful + # case for the system), so set PITR interval to something tiny. + "pitr_interval": "5s", + # Scaled down thresholds. We will run at ~1GB scale but would like to emulate + # the behavior of a system running at ~100GB scale. + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start() + + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + # We shut down pageservers while they might have some compaction work going on + ".*Compaction failed.*shutting down.*" + ] + ) + + env.storage_controller.allowed_errors.extend( + [ + # The neon_local functionality for updating computes is flaky for unknown reasons + ".*Local notification hook failed.*", + ".*Marking shard.*for notification retry.*", + ".*Failed to notify compute.*", + ] + ) + + # Total tenants + tenant_count = 4 + + # Transaction rate: we set this rather than running at full-speed because we + # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently. 
+ transaction_rate = 100 + + class TenantState: + def __init__(self, timeline_id, endpoint): + self.timeline_id = timeline_id + self.endpoint = endpoint + + # Create tenants + tenants = {} + for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)): + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf) + endpoint = env.endpoints.create("main", tenant_id=tenant_id) + tenants[tenant_id] = TenantState(timeline_id, endpoint) + endpoint.start() + + def run_pgbench_init(endpoint): + pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-i", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + def check_pgbench_output(out_path: str): + """ + When we run pgbench, we want not just an absence of errors, but also continuous evidence + of I/O progressing: our shard splitting and migration should not interrrupt the benchmark. + """ + matched_lines = 0 + stderr = Path(f"{out_path}.stderr").read_text() + + low_watermark = None + + # Apply this as a threshold for what we consider an unacceptable interruption to I/O + min_tps = transaction_rate // 10 + + for line in stderr.split("\n"): + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line) + if match is None: + # Fall back to older-version pgbench output (omits failure count) + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line) + if match is None: + continue + else: + (_time, tps) = match.groups() + tps = float(tps) + failed = 0 + else: + (_time, tps, failed) = match.groups() # type: ignore + tps = float(tps) + failed = int(failed) + + matched_lines += 1 + + if failed > 0: + raise RuntimeError( + f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0" + ) + + if low_watermark is None or low_watermark > tps: + low_watermark = tps + + # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not + # at the same time as a shard split. 
+ # if tps < min_tps: + # raise RuntimeError( + # f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}" + # ) + + log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}") + + if matched_lines == 0: + raise RuntimeError(f"pgbench output at {out_path} contained no progress lines") + + def run_pgbench_main(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "180", + "-R", + f"{transaction_rate}", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + def run_pgbench_read(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "30", + "-R", + f"{transaction_rate}", + "-S", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench inits") + for fut in pgbench_futs: + fut.result() + + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read/write pass") + for fut in pgbench_futs: + fut.result() + + def assert_all_split(): + for tenant_id in tenants.keys(): + shards = tenant_get_shards(env, tenant_id) + assert len(shards) == 8 + + # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise + # this test is not properly doing its job of validating that splits work nicely under load. 
+ assert_all_split() + + env.storage_controller.assert_log_contains(".*Successful auto-split.*") + + # Log timeline sizes, useful for debug, and implicitly validates that the shards + # are available in the places the controller thinks they should be. + for tenant_id, tenant_state in tenants.items(): + (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0] + timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero_id, tenant_state.timeline_id + ) + log.info(f"{shard_zero_id} timeline: {timeline_info}") + + # Run compaction for all tenants, restart endpoint so that on subsequent reads we will + # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # layers but not do any rewrites (we're still in the same generation) + for tenant_id, tenant_state in tenants.items(): + tenant_state.endpoint.stop() + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id) + tenant_state.endpoint.start() + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + env.storage_controller.consistency_check() + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.consistency_check() + + # Restart all pageservers + for ps in env.pageservers: + ps.stop() + ps.start() + + # Freshen gc_info in Timeline, so that when compaction runs in the background in the + # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass. 
+ for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + + # One last check data remains readable after everything has restarted + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + # Assert that some rewrites happened + # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged + # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 17dc96dabe..cb013ae8c3 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -3,6 +3,7 @@ import random import time import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TenantShardId, TimelineId @pytest.mark.timeout(3600) # super long running test: should go down as we optimize @@ -102,6 +102,9 @@ def test_storage_controller_many_tenants( tenant_id, shard_count, stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. 
+ tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) futs.append(f) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 7eb244d378..513ebc74c3 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -6,10 +6,10 @@ from typing import Any, Callable, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.types import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 50243e3ea7..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index d16d2d6a24..7e40081aa2 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 693add422f..8c60b454d8 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Generator, Optional import pytest +from 
fixtures.common_types import TenantId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException, TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId from fixtures.utils import wait_until @@ -162,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "checkpoint_distance": 10000, "checkpoint_timeout": "13m", "compaction_algorithm": { - "kind": "Tiered", + "kind": "tiered", }, "eviction_policy": { "kind": "LayerAccessThreshold", @@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "CrossValidation", + "switch_aux_file_policy": "cross-validation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bb622c0d59..035ab2796f 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -4,13 +4,13 @@ from pathlib import Path import psycopg2 import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgProtocol, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.types import TenantId, TimelineId def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py new file mode 100644 index 0000000000..5328aef156 --- /dev/null +++ b/test_runner/regress/test_aux_files.py @@ -0,0 +1,76 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + AuxFileStore, + NeonEnvBuilder, + logical_replication_sync, +) + + +def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): + env = neon_env_builder.init_start() + endpoint = 
env.endpoints.create_start("main") + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 + client.set_tenant_config(tenant_id, tenant_config) + # aux file v2 is enabled on the write path, so for now, it should be unset (or null) + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] + is None + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" + ) + cur.execute("create publication pub1 for table t, replication_example") + + # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) + # instead of going through the full logical replication process. 
+ vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + vanilla_pg.safe_psql( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" + ) + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Wait logical replication channel to be established + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + endpoint.stop() + + with env.pageserver.http_client() as client: + # aux file v2 flag should be enabled at this point + assert ( + client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] + == AuxFileStore.V2 + ) + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V1" + client.set_tenant_config(tenant_id, tenant_config) + # the flag should still be enabled + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == AuxFileStore.V2 + ) + env.pageserver.restart() + with env.pageserver.http_client() as client: + # aux file v2 flag should be persisted + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == AuxFileStore.V2 + ) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index ddd02238ea..eb503ddbfa 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -2,10 +2,10 @@ import threading import time import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from 
fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b79cad979f..0a5336f5a2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) error_regexes = [ ".*invalid branch start lsn.*", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9fe9f77fea..03d6946c15 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import List import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -14,7 +15,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException diff --git a/test_runner/regress/test_broken_timeline.py 
b/test_runner/regress/test_broken_timeline.py index 1279c1bf81..61afd820ca 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -3,6 +3,7 @@ import os from typing import List, Tuple import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -11,7 +12,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep @@ -56,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): (tenant0, timeline0, pg0) = tenant_timelines[0] log.info(f"Timeline {tenant0}/{timeline0} is left intact") - (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - with open(metadata_path, "w") as f: - f.write("overwritten with garbage!") - log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - - (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" + (tenant1, timeline1, pg1) = tenant_timelines[2] + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it @@ -72,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): with open(p, "wb") as f: f.truncate(0) f.truncate(size) - log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") + log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled") env.pageserver.start() @@ -80,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Tenant with corrupt local metadata works: remote storage is authoritative for metadata - pg1.start() - assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: - pg2.start() + pg1.start() log.info( - f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 43a3323462..4850a5c688 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,10 +1,12 @@ +import enum import json import os from typing import Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.pageserver.http import PageserverApiException from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -163,7 +165,6 @@ def test_sharding_compaction( 
image_layer_sizes[layer.layer_file_name] = layer.layer_file_size # Pageserver should assert rather than emit an empty layer file, but double check here - assert layer.layer_file_size is not None assert layer.layer_file_size > 0 shard_has_image_layers.append(len(image_layer_sizes) > 1) @@ -176,7 +177,7 @@ def test_sharding_compaction( # # We only do this check with tiny stripes, because large stripes may not give all shards enough # data to have statistically significant image layers - avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) log.info(f"Shard {shard_id} average image layer size: {avg_size}") assert avg_size > compaction_target_size / 2 @@ -190,3 +191,61 @@ def test_sharding_compaction( # Assert that everything is still readable workload.validate() + + +class CompactionAlgorithm(str, enum.Enum): + LEGACY = "legacy" + TIERED = "tiered" + + +@pytest.mark.parametrize( + "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED] +) +def test_uploads_and_deletions( + neon_env_builder: NeonEnvBuilder, + compaction_algorithm: CompactionAlgorithm, +): + """ + :param compaction_algorithm: the compaction algorithm to use. + """ + + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + # TODO remove these allowed errors + # https://github.com/neondatabase/neon/issues/7707 + # https://github.com/neondatabase/neon/issues/7759 + allowed_errors = [ + ".*duplicated L1 layer.*", + ".*delta layer created with.*duplicate values.*", + ".*assertion failed: self.lsn_range.start <= lsn.*", + ".*HTTP request handler task panicked: task.*panicked.*", + ] + if compaction_algorithm == CompactionAlgorithm.TIERED: + env.pageserver.allowed_errors.extend(allowed_errors) + + try: + generate_uploads_and_deletions(env, pageserver=env.pageserver) + except PageserverApiException as e: + log.info(f"Obtained PageserverApiException: {e}") + + # The errors occur flakily and no error is ensured to occur, + # however at least one of them occurs. 
+ if compaction_algorithm == CompactionAlgorithm.TIERED: + found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) + if not found_allowed_error: + raise Exception("None of the allowed_errors occured in the log") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 787c114fc1..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,4 +1,5 @@ import os +import re import shutil import subprocess import tempfile @@ -7,6 +8,7 @@ from typing import List, Optional import pytest import toml +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -21,7 +23,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
@@ -245,14 +246,34 @@ def test_forward_compatibility( compatibility_snapshot_dir / "repo", ) + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str + ) + + # does not include logs from previous runs + assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + neon_env_builder.start() + # ensure the specified pageserver is running + assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + check_neon_works( env, test_output_dir=test_output_dir, sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + except Exception: if breaking_changes_allowed: pytest.xfail( diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py new file mode 100644 index 0000000000..dd36190fcd --- /dev/null +++ b/test_runner/regress/test_compute_catalog.py @@ -0,0 +1,34 @@ +import requests +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_catalog(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") + + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + client = endpoint.http_client() + + objects = client.dbs_and_roles() + + # Assert that 'cloud_admin' role exists in the 'roles' list + assert any( + role["name"] == "cloud_admin" for role in objects["roles"] + ), "The 'cloud_admin' role is missing" + + # Assert that 'postgres' database exists in the 'databases' list + assert any( + db["name"] == "postgres" for db in 
objects["databases"] + ), "The 'postgres' database is missing" + + ddl = client.database_schema(database="postgres") + + assert "-- PostgreSQL database dump" in ddl + + try: + client.database_schema(database="nonexistentdb") + raise AssertionError("Expected HTTPError was not raised") + except requests.exceptions.HTTPError as e: + assert ( + e.response.status_code == 404 + ), f"Expected 404 status code, but got {e.response.status_code}" diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 5e9efa7cce..7ae2352c06 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -16,7 +17,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -623,15 +623,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): ratio = count_now / original_count abs_diff = abs(ratio - expected_ratio) assert original_count > count_now - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1" - ) + expectation = 0.06 + log.info( + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" + ) # in this test case both relative_spare and relative_equal produce # the same outcomes; this 
must be a quantization effect of similar # sizes (-s4 and -s6) and small (5MB) layer size. # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < 0.05 + assert abs_diff < expectation @pytest.mark.parametrize( diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 7471338ce5..0ebb99c712 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,7 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index d5f898492b..e6d51a77a6 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,6 +1,7 @@ import os from pathlib import Path +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -8,7 +9,6 @@ from fixtures.neon_fixtures import ( VanillaPostgres, ) from fixtures.port_distributor import PortDistributor -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -19,17 +19,16 @@ def test_fullbackup( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor, - pg_distrib_dir: Path, test_output_dir: Path, ): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fullbackup") - endpoint_main = env.endpoints.create_start("test_fullbackup") + # endpoint needs to be alive until the fullbackup so that we have + # prev_record_lsn for the vanilla_pg to start in read-write mode + # for some reason this does not happen if endpoint is 
shutdown. + endpoint_main = env.endpoints.create_start("main") with endpoint_main.cursor() as cur: - timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( @@ -41,17 +40,13 @@ def test_fullbackup( lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file + ) subprocess_capture( env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] ) @@ -61,7 +56,7 @@ def test_fullbackup( # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) # Restore from the backup and find the data we inserted port = port_distributor.get_port() diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index c5070ee815..44133f2350 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,6 +2,7 @@ import asyncio import concurrent.futures import random +from fixtures.common_types import TimelineId from fixtures.log_helper 
import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TimelineId # Test configuration # @@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") @@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" num_index_uploads = 0 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start() + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") endpoint = env.endpoints.create_start("test_gc_index_upload") diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 179cc273ec..cf7a1c56ee 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,9 +1,21 @@ +import asyncio import os import re +import threading import time +from functools import partial +import pytest from fixtures.log_helper import 
log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + log_replica_lag, + tenant_get_shards, + wait_replica_caughtup, +) +from fixtures.utils import wait_until # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -104,19 +116,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): wait_replica_caughtup(primary, secondary2) -# We had an issue that a standby server made GetPage requests with an -# old LSN, based on the last-written LSN cache, to avoid waits in the -# pageserver. However, requesting a page with a very old LSN, such -# that the GC horizon has already advanced past it, results in an -# error from the pageserver: -# "Bad request: tried to request a page version that was garbage collected" +# Test two different scenarios related to gc of data needed by hot standby. # -# To avoid that, the compute<-> pageserver protocol was updated so -# that that the standby now sends two LSNs, the old last-written LSN -# and the current replay LSN. +# When pause_apply is False, standby is mostly caught up with the primary. +# However, in compute <-> pageserver protocol version 1 only one LSN had been +# sent to the pageserver in page request, and to avoid waits in the pageserver +# it was last-written LSN cache value. If page hasn't been updated for a long +# time that resulted in an error from the pageserver: "Bad request: tried to +# request a page version that was garbage collected". For primary this wasn't a +# problem because pageserver always bumped LSN to the newest one; for standy +# that would be incorrect since we might get page fresher then apply LSN. Hence, +# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN +# in case of standby) and not_modified_since which could be used as an +# optimization to avoid waiting. 
# # https://github.com/neondatabase/neon/issues/6211 -def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): +# +# When pause_apply is True we model standby lagging behind primary (e.g. due to +# high max_standby_streaming_delay). To prevent pageserver from removing data +# still needed by the standby apply LSN is propagated in standby -> safekeepers +# -> broker -> pageserver flow so that pageserver could hold off gc for it. +@pytest.mark.parametrize("pause_apply", [False, True]) +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): tenant_conf = { # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", @@ -160,6 +181,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # so we still remember the LSNs of the pages. s_cur.execute("SELECT clear_buffer_cache()") + if pause_apply: + s_cur.execute("SELECT pg_wal_replay_pause()") + # Do other stuff on the primary, to advance the WAL p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") @@ -176,6 +200,155 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # generates use old not_modified_since LSNs, older than # the GC cutoff, but new request LSNs. (In protocol # version 1 there was only one LSN, and this failed.) + log_replica_lag(primary, secondary) s_cur.execute("SELECT COUNT(*) FROM test") + log_replica_lag(primary, secondary) res = s_cur.fetchone() assert res[0] == 10000 + + +def run_pgbench(connstr: str, pg_bin: PgBin): + log.info(f"Start a pgbench workload on pg {connstr}") + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + log.info("pgbench init done") + pg_bin.run_capture(["pgbench", "-T60", connstr]) + + +# assert that pgbench_accounts and its index are created. 
+def pgbench_accounts_initialized(ep): + ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass") + + +# Test that hot_standby_feedback works in neon (it is forwarded through +# safekeepers). That is, ensure queries on standby don't fail during load on +# primary under the following conditions: +# - pgbench bombards primary with updates. +# - On the secondary we run long select of the updated table. +# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts +# so apply doesn't need to wait. +# - Do agressive vacuum on primary which still shouldn't create conflicts. +# Actually this appears to be redundant due to microvacuum existence. +# +# Without hs feedback enabled we'd see 'User query might have needed to see row +# versions that must be removed.' errors. +def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + agressive_vacuum_conf = [ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime = 10s", + "autovacuum_vacuum_threshold = 25", + "autovacuum_vacuum_scale_factor = 0.1", + "autovacuum_vacuum_cost_delay = -1", + ] + with env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf + ) as primary: + # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with + # 'User was holding shared buffer pin for too long.'. + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_standby_streaming_delay=2s", + "neon.protocol_version=2", + "hot_standby_feedback=true", + ], + ) as secondary: + log.info( + f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" + ) + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) + t.start() + # Wait until pgbench_accounts is created + filled on replica *and* + # index is created. 
Otherwise index creation would conflict with + # read queries and hs feedback won't save us. + wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + + # Test should fail if hs feedback is disabled anyway, but cross + # check that walproposer sets some xmin. + def xmin_is_not_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert int(slot_xmin) > 0 + + wait_until(10, 1.0, xmin_is_not_null) + for _ in range(1, 5): + # in debug mode takes about 5-7s + balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") + log.info(f"balance={balance}") + log_replica_lag(primary, secondary) + t.join() + + # check xmin is reset when standby is gone + def xmin_is_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert slot_xmin is None + + wait_until(10, 1.0, xmin_is_null) + + +# Test race condition between WAL replay and backends performing queries +# https://github.com/neondatabase/neon/issues/7791 +def test_replica_query_race(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary_ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + + with primary_ep.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") + + standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") + time.sleep(1) + + # In primary, run a lot of UPDATEs on a single page + finished = False + writecounter = 1 + + async def primary_workload(): + nonlocal writecounter, finished + conn = await primary_ep.connect_async() + while writecounter < 10000: + writecounter += 1 + await conn.execute(f"UPDATE test SET counter = 
{writecounter}") + finished = True + + # In standby, at the same time, run queries on it. And repeatedly drop caches + async def standby_workload(): + nonlocal writecounter, finished + conn = await standby_ep.connect_async() + reads = 0 + while not finished: + readcounter = await conn.fetchval("SELECT counter FROM test") + + # Check that the replica is keeping up with the primary. In local + # testing, the lag between primary and standby is much smaller, in + # the ballpark of 2-3 counter values. But be generous in case there's + # some hiccup. + # assert(writecounter - readcounter < 1000) + assert readcounter <= writecounter + if reads % 100 == 0: + log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") + reads += 1 + + await conn.execute("SELECT clear_buffer_cache()") + + async def both(): + await asyncio.gather( + primary_workload(), + standby_workload(), + ) + + asyncio.run(both()) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 132427ba2d..62229ebfe7 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -20,8 +21,7 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import subprocess_capture +from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): @@ -248,15 +248,9 @@ def _import( path to the backup archive file""" log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. 
- # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -305,22 +299,11 @@ def _import( assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant} {timeline} {lsn}" new_tar_output_file = test_output_dir / "fullbackup-new.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - query, - "-o", - str(new_tar_output_file), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file) # Check it's the same as the first fullbackup - # TODO pageserver should be checking checksum - assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set()) # Check that gc works pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 5c967fd72e..193149ea03 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage 
import RemoteStorageKind @@ -159,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something @@ -270,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric == 0 ), "ensure that resident_physical_size metric is zero" assert resident_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote + layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" remote_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_remote_physical_size" ) assert remote_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + layer.layer_file_size for layer in info.historic_layers if layer.remote ), "ensure that remote_physical_size metric corresponds to layer map dump" log.info("before runnning GC, ensure that remote_physical size is zero") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index f311a8bf2c..18e5111786 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,10 +1,11 @@ import time +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, +from fixtures.pageserver.common_types import ( + DeltaLayerName, + ImageLayerName, 
is_future_layer, ) from fixtures.pageserver.utils import ( @@ -13,7 +14,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until @@ -81,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -146,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 57d3447cae..a657d5a035 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,6 +4,7 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -12,7 +13,6 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 
225622868d..83d52d4c4c 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -2,10 +2,10 @@ import re import time from datetime import datetime, timedelta, timezone +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index cb69f0ef39..ba170cfb4c 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,6 +5,7 @@ from typing import cast import pytest import requests +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index e880445c4d..98fb06a0d6 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -3,13 +3,13 @@ import os import time from pathlib import Path +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): def 
test_import_at_2bil( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_distrib_dir: Path, - pg_bin, + pg_bin: PgBin, vanilla_pg, ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 391305c58a..f1dd3fb67d 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") diff --git 
a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 6c2556f6a2..b137fb3a5c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -5,6 +5,7 @@ import time from collections import defaultdict from typing import Any, DefaultDict, Dict, Tuple +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -21,7 +22,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until @@ -540,7 +540,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") - assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 0b36b32552..4af7dcdfc3 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,9 +1,9 @@ from typing import Optional import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index be351db429..abbea59113 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,19 +2,22 @@ import subprocess from pathlib import Path from typing import Optional +import toml +from fixtures.common_types import Lsn, TenantId, TimelineId 
from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -28,18 +31,38 @@ def test_pageserver_init_node_id( stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 + # "tenant_config", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + good_init_cmd = [ + "--init", + 
f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py deleted file mode 100644 index c04348b488..0000000000 --- a/test_runner/regress/test_pageserver_config.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - last_flush_lsn_upload, -) - - -@pytest.mark.parametrize("kind", ["sync", "async"]) -def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): - neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" - # ensure it starts - env = neon_env_builder.init_start() - # ensure the metric is set - ps_http = env.pageserver.http_client() - metrics = ps_http.get_metrics() - samples = metrics.query_all("pageserver_wal_redo_process_kind") - assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] - # ensure default tenant's config kind matches - # => write some data to force-spawn walredo - ep = env.endpoints.create_start("main") - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("create table foo(bar text)") - cur.execute("insert into foo select from generate_series(1, 100)") - last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) - ep.stop() - ep.start() - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("select count(*) from foo") - [(count,)] = cur.fetchall() - assert count == 100 - - status = ps_http.tenant_status(env.initial_tenant) - assert status["walredo"]["process"]["kind"] == kind diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index adcf7de8d4..0235cf6d20 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -16,29 +16,27 @@ import time 
from typing import Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - NeonPageserver, PgBin, S3Scrubber, - flush_ep_to_pageserver, - last_flush_lsn_upload, + generate_uploads_and_deletions, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, list_prefix, wait_for_last_record_lsn, wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import TenantId, TimelineId -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions @@ -59,82 +57,6 @@ TENANT_CONF = { } -def generate_uploads_and_deletions( - env: NeonEnv, - *, - init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, - pageserver: NeonPageserver, -): - """ - Using the environment's default tenant + timeline, generate a load pattern - that results in some uploads and some deletions to remote storage. 
- """ - - if tenant_id is None: - tenant_id = env.initial_tenant - assert tenant_id is not None - - if timeline_id is None: - timeline_id = env.initial_timeline - assert timeline_id is not None - - ps_http = pageserver.http_client() - - with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver.id - ) as endpoint: - if init: - endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - def churn(data): - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 200) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) - assert tenant_id is not None - assert timeline_id is not None - # We are waiting for uploads as well as local flush, in order to avoid leaving the system - # in a state where there are "future layers" in remote storage that will generate deletions - # after a restart. - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - # Compaction should generate some GC-elegible layers - for i in range(0, 2): - churn(f"{i if data is None else data}") - - gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) - print_gc_result(gc_result) - assert gc_result["layers_removed"] > 0 - - # Stop endpoint and flush all data to pageserver, then checkpoint it: this - # ensures that the pageserver is in a fully idle state: there will be no more - # background ingest, no more uploads pending, and therefore no non-determinism - # in subsequent actions like pageserver restarts. 
- final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - # Finish uploads - wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) - # Finish all remote writes (including deletions) - wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) - - def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None ): @@ -703,7 +625,6 @@ def test_multi_attach( workload.validate(pageservers[2].id) -@pytest.mark.skip(reason="To be enabled after release with new local path style") def test_upgrade_generationless_local_file_paths( neon_env_builder: NeonEnvBuilder, ): @@ -712,39 +633,86 @@ def test_upgrade_generationless_local_file_paths( generation numbers: it should accept these layer files, and avoid doing a delete/download cycle on them. """ - env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(1000) - env.pageserver.stop() + attached_pageserver = env.get_tenant_pageserver(tenant_id) + secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ + 0 + ] + + attached_pageserver.http_client().tenant_heatmap_upload(tenant_id) + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) # Rename the local paths to legacy format, to simulate what - # we would see when upgrading - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - files_renamed = 0 - for filename in os.listdir(timeline_dir): - path = os.path.join(timeline_dir, 
filename) - log.info(f"Found file {path}") - if path.endswith("-00000001"): - new_path = path[:-9] - os.rename(path, new_path) - log.info(f"Renamed {path} -> {new_path}") - files_renamed += 1 + # we would see when upgrading. Do this on both attached and secondary locations, as we will + # test the behavior of both. + for pageserver in env.pageservers: + pageserver.stop() + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 - assert files_renamed > 0 + assert files_renamed > 0 - env.pageserver.start() + pageserver.start() workload.validate() # Assert that there were no on-demand downloads assert ( - env.pageserver.http_client().get_metric_value( + attached_pageserver.http_client().get_metric_value( "pageserver_remote_ondemand_downloaded_layers_total" ) == 0 ) + + # Do a secondary download and ensure there were no layer downloads + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) + assert ( + secondary_pageserver.http_client().get_metric_value( + "pageserver_secondary_download_layer_total" + ) + == 0 + ) + + # Check that when we evict and promote one of the legacy-named layers, everything works as + # expected + local_layers = list( + ( + parse_layer_file_name(path.name), + os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path), + ) + for path in attached_pageserver.list_layers(tenant_id, timeline_id) + ) + (victim_layer_name, victim_path) = local_layers[0] + assert os.path.exists(victim_path) + + attached_pageserver.http_client().evict_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + assert not os.path.exists(victim_path) + + attached_pageserver.http_client().download_layer( + tenant_id, timeline_id, 
victim_layer_name.to_str() + ) + # We should download into the same local path we started with + assert os.path.exists(victim_path) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 42cc28efee..111285b40c 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -2,10 +2,10 @@ import json import uuid from anyio import Path +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c5dc0f2919..66b6185aaa 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -5,6 +5,7 @@ from typing import Optional, Tuple import psutil import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until TIMELINE_COUNT = 10 @@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): total_historic_bytes += sum( layer.layer_file_size for layer in layer_map.historic_layers - if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn + if Lsn(layer.lsn_start) > initdb_lsn ) total_ephemeral_layers += len(layer_map.in_memory_layers) diff --git a/test_runner/regress/test_pageserver_metric_collection.py 
b/test_runner/regress/test_pageserver_metric_collection.py index c34ef46d07..b0465f2a96 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -7,6 +7,7 @@ from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -17,7 +18,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, remote_storage_to_toml_inline_table, ) -from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index c40bb962f2..25a3f8521c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -5,9 +5,10 @@ import time from typing import Any, Dict, Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -15,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_timelines = {} # This mirrors a constant in `downloader.rs` - freshen_interval_secs = 60 + 
default_download_period_secs = 60 + + # The upload period, which will also be the download once the secondary has seen its first heatmap + upload_period_secs = 30 for _i in range(0, tenant_count): tenant_id = TenantId.generate() @@ -587,17 +590,32 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): placement_policy='{"Attached":1}', # Run with a low heatmap period so that we can avoid having to do synthetic API calls # to trigger the upload promptly. - conf={"heatmap_period": "1s"}, + conf={"heatmap_period": f"{upload_period_secs}s"}, ) env.neon_cli.create_timeline("main2", tenant_id, timeline_b) tenant_timelines[tenant_id] = [timeline_a, timeline_b] + def await_log(pageserver, deadline, expression): + """ + Wrapper around assert_log_contains that waits with a deadline rather than timeout + """ + now = time.time() + if now > deadline: + raise RuntimeError(f"Timed out waiting for {expression}") + else: + timeout = int(deadline - now) + 1 + try: + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + except: + log.error(f"Timed out waiting for '{expression}'") + raise + t_start = time.time() # Wait long enough that the background downloads should happen; we expect all the inital layers # of all the initial timelines to show up on the secondary location of each tenant. 
- time.sleep(freshen_interval_secs * 1.5) + initial_download_deadline = time.time() + default_download_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -605,16 +623,32 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + now = time.time() + if now > initial_download_deadline: + raise RuntimeError("Timed out waiting for initial secondary download") + else: + for timeline_id in timelines: + log.info( + f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + initial_download_deadline, + f".*{timeline_id}.*Wrote timeline_detail.*", + ) + for timeline_id in timelines: - log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + log.info( + f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}" + ) # One or more layers should be present for all timelines assert ps_secondary.list_layers(tenant_id, timeline_id) # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) - # Wait long enough for the secondary locations to see the deletion - time.sleep(freshen_interval_secs * 1.5) + # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor + deletion_deadline = time.time() + upload_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -622,11 +656,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + expect_del_timeline = timelines[1] + 
log.info( + f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + deletion_deadline, + f".*Timeline no longer in heatmap.*{expect_del_timeline}.*", + ) + # This one was not deleted assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted - assert not ps_secondary.list_layers(tenant_id, timelines[1]) + log.info( + f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" + ) + assert not ps_secondary.list_layers(tenant_id, expect_del_timeline) t_end = time.time() @@ -640,7 +687,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) - expect_download_rate = 1.0 / freshen_interval_secs + expect_download_rate = 1.0 / upload_period_secs log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") assert download_rate < expect_download_rate * 2 diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2b1b7fff34..885a94a557 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,16 +1,25 @@ # # This file runs pg_regress-based tests. # +from __future__ import annotations + from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, check_restored_datadir_content, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage +if TYPE_CHECKING: + from typing import Optional + + from fixtures.neon_fixtures import PgBin + from pytest import CaptureFixture + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @@ -19,12 +28,14 @@ def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + """ :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this many shards. @@ -42,7 +53,7 @@ def test_pg_regress( # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -77,7 +88,67 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. 
This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=DBNAME, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) # Run the PostgreSQL "isolation" tests, in src/test/isolation. 
@@ -86,8 +157,8 @@ def test_pg_regress( def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], @@ -142,8 +213,8 @@ def test_isolation( def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 539ef3eda7..7e676b5515 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = ( - "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"} ) - - env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") main_pg_conn = endpoint_main.connect() diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 6d1cb9765a..6211446a40 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): query_message = "SELECT 1".encode("utf-8") + b"\0" length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") await websocket.send( - [length0, startup_message, b"p", length1, 
auth_message, b"Q", length2, query_message] + length0 + + startup_message + + b"p" + + length1 + + auth_message + + b"Q" + + length2 + + query_message ) startup_response = await websocket.recv() diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index e6b3ccd7ec..cc5853b727 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,8 +1,8 @@ from contextlib import closing +from fixtures.common_types import Lsn from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index b7c8f36107..ba8b91e84d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index ab5c8be256..e21f9bb6f6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "1048576", + } + ) env.pageserver.is_testing_enabled_or_skip() # We expect the pageserver to exit, which will cause storage storage controller diff --git 
a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 70c025c225..7f79bf5d5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,13 +6,14 @@ import time from typing import Dict, List, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -25,7 +26,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, available_remote_storages, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( assert_eq, assert_ge, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 9227836862..7fdabaaec7 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,6 +1,7 @@ import time from datetime import datetime, timezone +from fixtures.common_types import Lsn from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -14,7 +15,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import Lsn from fixtures.utils import run_pg_bench_small diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py index 018c1637d0..8981000c24 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_s3_scrubber.py @@ -3,12 +3,12 @@ import shutil from typing import Optional import pytest +from fixtures.common_types import TenantShardId from fixtures.neon_fixtures import ( 
NeonEnvBuilder, S3Scrubber, ) from fixtures.remote_storage import S3Storage, s3_storage -from fixtures.types import TenantShardId from fixtures.workload import Workload diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index d33803250f..bbb1ad0c6d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -18,7 +19,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -632,7 +632,6 @@ def test_sharding_ingest_layer_sizes( historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) for layer in historic_layers: - assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" small_layer_count += 1 @@ -1326,3 +1325,45 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): # Ensure that post-endpoint-restart modifications are ingested happily by pageserver wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + +def test_top_tenants(neon_env_builder: NeonEnvBuilder): + """ + The top_tenants API is used in shard auto-splitting to find candidates. 
+ """ + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenants = [] + n_tenants = 8 + for i in range(0, n_tenants): + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Write a different amount of data to each tenant + w = Workload(env, tenant_id, timeline_id) + w.init() + w.write_rows(i * 1000) + w.stop() + + logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "current_logical_size" + ] + tenants.append((tenant_id, timeline_id, logical_size)) + + log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}") + + # Ask for 1 largest tenant + top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0) + assert len(top_1["shards"]) == 1 + assert top_1["shards"][0]["id"] == str(tenants[-1][0]) + assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2] + + # Apply a lower bound limit + top = env.pageserver.http_client().top_tenants( + "max_logical_size", 100, 8, where_gt=tenants[3][2] + ) + assert len(top["shards"]) == n_tenants - 4 + assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index bdd356388f..3a9a522f3f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Union import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -25,7 +26,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, 
subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( @@ -1284,7 +1284,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): # Apply failpoint env.storage_controller.configure_failpoints( - ("tenant-update-policy-exclusive-lock", "return(31000)") + ("tenant-update-policy-exclusive-lock", "return(35000)") ) # This will hold the exclusive for enough time to cause an warning @@ -1306,7 +1306,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): env.storage_controller.pageserver_api().timeline_create( pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id ) - thread_update_tenant_policy.join(timeout=10) + thread_update_tenant_policy.join() env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") env.storage_controller.assert_log_contains( diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index fc099297e1..2cbb036c0d 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,29 +1,36 @@ import json from contextlib import closing +from typing import Any, Dict import psycopg2.extras +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): """Test per tenant configuration""" - # set some non-default global config - neon_env_builder.pageserver_config_override = """ -page_cache_size=444; -wait_lsn_timeout='111 s'; -[tenant_config] -checkpoint_distance = 10000 -compaction_target_size = 1048576 -evictions_low_residence_duration_metric_threshold = "2 days" -eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", 
threshold = "23 hours" } -""" + + def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): + ps_cfg["page_cache_size"] = 444 + ps_cfg["wait_lsn_timeout"] = "111 s" + + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["checkpoint_distance"] = 10000 + tenant_config["compaction_target_size"] = 1048576 + tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days" + tenant_config["eviction_policy"] = { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23 hours", + } + + neon_env_builder.pageserver_config_override = set_some_nondefault_global_config env = neon_env_builder.init_start() # we configure eviction but no remote storage, there might be error lines diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 363c3c88ec..3fc44de6fa 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,6 +5,7 @@ import shutil from threading import Thread import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -26,7 +27,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0ba0108651..12a4730e69 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,6 +7,7 @@ from typing import List, Optional import asyncpg import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -22,7 +23,6 @@ from 
fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 68d9d9a660..be289e03d6 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from fixtures.pageserver.http import PageserverHttpClient @@ -20,7 +21,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, wait_until, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index e73eae91f0..d3a228dbeb 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until @@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches( gc_horizon = 128 * 1024 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = 
neon_env_builder.init_start( + initial_tenant_conf={ + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0sec", + "gc_horizon": gc_horizon, + } + ) # FIXME: we have a race condition between GC and delete timeline. GC might fail with this # error. Similar to https://github.com/neondatabase/neon/issues/2671 diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 75e5c2c91c..d08ad3cd2e 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( @@ -5,7 +6,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 2832304dcc..93e9ad3673 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -9,6 +9,7 @@ from typing import List import pytest import requests +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -24,7 +25,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index a1e96928bf..168876b711 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ 
b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,6 +11,7 @@ import os from pathlib import Path from typing import List, Tuple +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -18,7 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -28,7 +29,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0eb1327c9e..da37f469b3 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -6,6 +6,7 @@ import threading import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -31,7 +32,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until from urllib3.util.retry import Retry diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b8a88ca6df..f0b2f7d733 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,18 +1,22 @@ +import datetime import enum from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List +from typing import List, Tuple import pytest 
+from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PgBin, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import HistoricLayerInfo -from fixtures.pageserver.utils import wait_timeline_detail_404 -from fixtures.types import Lsn, TimelineId +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage +from fixtures.utils import assert_pageserver_backups_equal def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -51,20 +55,25 @@ class Branchpoint(str, enum.Enum): SHUTDOWN_ALLOWED_ERRORS = [ ".*initial size calculation failed: downloading failed, possibly for shutdown", ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*", + ".*Task 'initial size calculation' .* panicked.*", ] @pytest.mark.parametrize("branchpoint", Branchpoint.all()) @pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( - neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool + test_output_dir, + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, ): """ Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. 
""" - # TODO: parametrize; currently unimplemented over at pageserver - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -143,6 +152,13 @@ def test_ancestor_detach_branched_from( else: branch_layers = set() + # run fullbackup to make sure there are no off by one errors + # take this on the parent + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before + ) + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == set() @@ -170,12 +186,23 @@ def test_ancestor_detach_branched_from( # but if nothing was copied, then there is no nice rule. # there could be a hole in LSNs between copied from the "old main" and the first branch layer. + # take this on the detached, at same lsn + fullbackup_after = test_output_dir / "fullbackup-after.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after + ) + client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different + # as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal"} -@pytest.mark.parametrize("restart_after", [True, False]) -def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool): + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) + + +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): """ The case from RFC: @@ -204,9 +231,6 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res We confirm the end result by being able to delete "old main" after deleting "after". 
""" - # TODO: support not yet implemented for these - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -244,42 +268,57 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) - if write_to_branch_first: - with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: - assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192 - with ep.cursor() as cur: - cur.execute("UPDATE audit SET starts = starts + 1") - assert cur.rowcount == 1 - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - - client.timeline_checkpoint(env.initial_tenant, timeline_id) - all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == {reparented, same_branchpoint} - if restart_after: - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() # checking the ancestor after is much faster than waiting for the endpoint not start expected_result = [ ("main", env.initial_timeline, None, 16384, 1), ("after", after, env.initial_timeline, 16384, 1), - ("new main", timeline_id, None, 8192, 2), + ("new main", timeline_id, None, 8192, 1), ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), ("reparented", reparented, timeline_id, 0, 1), ] - for _, timeline_id, expected_ancestor, _, _ in expected_result: - details = client.timeline_detail(env.initial_tenant, timeline_id) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) ancestor_timeline_id = details["ancestor_timeline_id"] if expected_ancestor is None: assert ancestor_timeline_id is None else: assert TimelineId(ancestor_timeline_id) == expected_ancestor + index_part = 
env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. + when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + for name, _, _, rows, starts in expected_result: with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows @@ -293,14 +332,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) -@pytest.mark.parametrize("restart_after", [True, False]) -def test_detached_receives_flushes_while_being_detached( - neon_env_builder: NeonEnvBuilder, restart_after: bool -): +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. 
""" - write_to_branch_first = True env = neon_env_builder.init_start() @@ -330,12 +365,6 @@ def test_detached_receives_flushes_while_being_detached( ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows - if write_to_branch_first: - rows += insert_rows(256, ep) - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - client.timeline_checkpoint(env.initial_tenant, timeline_id) - log.info("completed {write_to_branch_first=}") - def small_txs(ep, queue: Queue[str], barrier): extra_rows = 0 @@ -368,11 +397,6 @@ def test_detached_receives_flushes_while_being_detached( reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert len(reparented) == 0 - if restart_after: - # ep and row production is kept alive on purpose - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() queue.put("done") @@ -394,6 +418,173 @@ def test_detached_receives_flushes_while_being_detached( env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +def test_compaction_induced_by_detaches_in_history( + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin +): + """ + Assuming the tree of timelines: + + root + |- child1 + |- ... + |- wanted_detached_child + + Each detach can add N more L0 per level, this is actually unbounded because + compaction can be arbitrarily delayed (or detach happen right before one + starts). If "wanted_detached_child" has already made progress and compacted + L1s, we want to make sure "compaction in the history" does not leave the + timeline broken. + """ + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # we want to create layers manually so we don't branch on arbitrary + # Lsn, but we also do not want to compact L0 -> L1. 
+ "compaction_threshold": "99999", + "compaction_period": "0s", + # shouldn't matter, but just in case + "gc_period": "0s", + } + ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + client = env.pageserver.http_client() + + def delta_layers(timeline_id: TimelineId): + # shorthand for more readable formatting + return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("create table integers (i bigint not null);") + ep.safe_psql("insert into integers (i) values (42)") + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + assert len(delta_layers(env.initial_timeline)) == 2 + + more_good_numbers = range(0, 3) + + branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)] + + for num in more_good_numbers: + branch_name = f"br-{len(branches)}" + branch_timeline_id = env.neon_cli.create_branch( + branch_name, + ancestor_branch_name=branches[-1][0], + tenant_id=env.initial_tenant, + ancestor_start_lsn=branch_lsn, + ) + branches.append((branch_name, branch_timeline_id)) + + with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + assert len(delta_layers(branch_timeline_id)) == 1 + + # now fill in the final, most growing timeline + + branch_name, branch_timeline_id = branches[-1] + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)") + + last_suffix = None + for suffix in range(0, 4): + ep.safe_psql(f"create table 
other_table_{suffix} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + last_suffix = suffix + + assert last_suffix is not None + + assert len(delta_layers(branch_timeline_id)) == 5 + + client.patch_tenant_config_client_side( + env.initial_tenant, {"compaction_threshold": 5}, None + ) + + client.timeline_compact(env.initial_tenant, branch_timeline_id) + + # one more layer + ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + + # we need to wait here, because the detaches will do implicit tenant restart, + # and we could get unexpected layer counts + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True) + + assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 + + skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] + + # take the fullbackup before and after inheriting the new L0s + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before + ) + + for _, timeline_id in skip_main: + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert reparented == set(), "we have no earlier branches at any level" + + post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" + + # checkpoint does compaction, which in turn decides to run, because + # there is now in total threshold number L0s even if they are not + # adjacent in Lsn space: + # + # inherited flushed during this checkpoint + # \\\\ / + # 1234X5---> lsn + # | + # l1 layers from "fill in the final, most growing timeline" + # 
+ # branch_lsn is between 4 and first X. + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" + + fullbackup_after = test_output_dir / "fullbackup_after.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after + ) + + # we don't need to skip any files, because zenith.signal will be identical + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) + + +def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + first_branch = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + client.tenant_delete(env.initial_tenant) + wait_tenant_status_404(client, env.initial_tenant, 10, 1) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 628c484fbd..db5297870e 100644 --- a/test_runner/regress/test_timeline_size.py +++ 
b/test_runner/regress/test_timeline_size.py @@ -10,6 +10,7 @@ from typing import Optional import psycopg2.errors import psycopg2.extras import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -31,7 +32,6 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until @@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = ( - "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "10m", + } ) - - env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") @@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "1s", + } + ) pageserver_http = 
env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") @@ -650,7 +656,7 @@ def get_physical_size_values( client = env.pageserver.http_client() res.layer_map_file_size_sum = sum( - layer.layer_file_size or 0 + layer.layer_file_size for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers ) diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 708bf0dfeb..137d28b9fa 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -1,4 +1,5 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.pg_version import PgVersion # @@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("CREATE UNLOGGED TABLE iut (id int);") # create index to test unlogged index relation as well cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") - cur.execute("INSERT INTO iut values (42);") + cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;") + cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") @@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv): conn2 = endpoint2.connect() cur2 = conn2.cursor() # after restart table should be empty but valid - cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)") cur2.execute("EXECUTE iut_plan (43);") cur2.execute("SELECT * FROM iut") - assert cur2.fetchall() == [(43,)] + results = cur2.fetchall() + # Unlogged sequences were introduced in v15. On <= v14, the sequence created + # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps + # the old value (2) on restart. While on v15 and above, it's unlogged, so it + # gets reset to 1. 
+ if env.pg_version <= PgVersion.V14: + assert results == [(43, 2)] + else: + assert results == [(43, 1)] diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 06f2a8befd..b549db1af6 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. # - # Ignore the LSN on the page though (first 8 bytes). If the dirty - # VM page is flushed from the cache for some reason, it gets WAL-logged, - # which changes the LSN on the page. + # Ignore page header (24 bytes) of visibility map. + # If the dirty VM page is flushed from the cache for some reason, + # it gets WAL-logged, which changes the LSN on the page. + # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() cur.execute( "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() assert vm_page_at_pageserver == vm_page_in_cache diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 967d133e18..ea66eeff63 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -18,6 +18,7 @@ import psycopg2.errors import psycopg2.extras import pytest from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( @@ -47,7 +48,6 @@ from fixtures.remote_storage import 
( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index dce5616ac6..b5d86de574 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -8,10 +8,10 @@ from typing import List, Optional import asyncpg import pytest import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 7ac6e6332c..6582b34218 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,8 +1,9 @@ import time +from typing import Any, Dict +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): # Kills one of the safekeepers and ensures that only the active ones are printed in the state. 
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster - neon_env_builder.pageserver_config_override = """ - wait_lsn_timeout = "1s" - tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"} - """ + def customize_pageserver_toml(ps_cfg: Dict[str, Any]): + ps_cfg["wait_lsn_timeout"] = "1s" + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["walreceiver_connect_timeout"] = "2s" + tenant_config["lagging_wal_timeout"] = "2s" + + neon_env_builder.pageserver_config_override = customize_pageserver_toml + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers neon_env_builder.safekeepers_id_start = 12345 neon_env_builder.num_safekeepers = 3 diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 083a259d85..01a1d5cf55 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -6,6 +6,7 @@ from typing import List import pytest import zstandard +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 13159efbe8..ad37807dba 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -2,10 +2,10 @@ import time import psutil import pytest +from fixtures.common_types import TenantId from fixtures.log_helper import log from 
fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d6f7e2c604..0d30e28f74 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a +Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f0d6b0ef75..74fb144890 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a +Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 8ef3c33aa0..3c2b9d576c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 +Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 diff --git a/vendor/revisions.json b/vendor/revisions.json index c5b55762fa..2f16f334c5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], - "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], - "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] + "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], + "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], + "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 41ca16f16b..0f9d56e466 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -244,6 +244,93 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: 
+ values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + else (pg_current_wal_lsn() - '0/0')::FLOAT8 + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoints_req + type: gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; + + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. + # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
+ - metric_name: logical_slot_restart_lsn + type: gauge + help: 'restart_lsn of logical slots' + key_labels: + - slot_name + values: [restart_lsn] + query: | + select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn from pg_replication_slots where slot_type = 'logical'; + + - metric_name: retained_wal + type: gauge + help: 'Retained WAL in inactive replication slots' + key_labels: + - slot_name + values: [retained_wal] + query: | + SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + FROM pg_replication_slots + WHERE active = false; + + - metric_name: wal_is_lost + type: gauge + help: 'Whether or not the replication slot''s wal_status is lost' + key_labels: + - slot_name + values: [wal_status_is_lost] + query: | + SELECT slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_status_is_lost + FROM pg_replication_slots; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling @@ -295,7 +382,6 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; - build: | # Build cgroup-tools # diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b2da33e44a..f364a6c2e0 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,13 +13,14 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-types = { version = "1", 
default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -51,7 +52,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } @@ -102,7 +104,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", 
default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }