fmt

Pass down timeouts to the reqwest client
use OIDC role instead of AWS access keys for managing test runner (#10117 )
2026-02-11 06:30:37 +00:00 · 2024-12-13 18:54:20 +01:00 · 2024-12-13 18:32:46 +01:00 · 2024-12-12 20:25:39 +00:00 · 2024-12-12 20:23:24 +00:00 · 2024-12-12 19:35:38 +00:00
96 changed files with 3059 additions and 1233 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -23,3 +23,5 @@ config-variables:
  - BENCHMARK_INGEST_TARGET_PROJECTID
  - PGREGRESS_PG16_PROJECT_ID
  - PGREGRESS_PG17_PROJECT_ID
+  - SLACK_ON_CALL_QA_STAGING_STREAM
+  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -15,10 +15,21 @@ inputs:
  prefix:
    description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
    required: false
+  aws_oicd_role_arn:
+    description: "the OIDC role arn for aws auth"
+    required: false
+    default: ""

 runs:
  using: "composite"
  steps:
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-duration-seconds: 3600
+
    - name: Download artifact
      id: download-artifact
      shell: bash -euxo pipefail {0}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -62,6 +62,7 @@ runs:
      with:
        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

    - name: Download Neon binaries for the previous release
      if: inputs.build_type != 'remote'
@@ -70,6 +71,7 @@ runs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon-previous
        prefix: latest
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

    - name: Download compatibility snapshot
      if: inputs.build_type != 'remote'
@@ -81,6 +83,7 @@ runs:
        # The lack of compatibility snapshot (for example, for the new Postgres version)
        # shouldn't fail the whole job. Only relevant test should fail.
        skip-if-does-not-exist: true
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
@@ -218,6 +221,7 @@ runs:
        # The lack of compatibility snapshot shouldn't fail the job
        # (for example if we didn't run the test for non build-and-test workflow)
        skip-if-does-not-exist: true
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
@@ -232,3 +236,4 @@ runs:
      with:
        report-dir: /tmp/test_output/allure/results
        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
--- a/.github/actions/save-coverage-data/action.yml
+++ b/.github/actions/save-coverage-data/action.yml
@@ -14,9 +14,11 @@ runs:
        name: coverage-data-artifact
        path: /tmp/coverage
        skip-if-does-not-exist: true # skip if there's no previous coverage to download
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

    - name: Upload coverage data
      uses: ./.github/actions/upload
      with:
        name: coverage-data-artifact
        path: /tmp/coverage
+        aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -14,6 +14,10 @@ inputs:
  prefix:
    description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
    required: false
+  aws_oicd_role_arn:
+    description: "the OIDC role arn for aws auth"
+    required: false
+    default: ""

 runs:
  using: "composite"
@@ -53,6 +57,13 @@ runs:

        echo 'SKIPPED=false' >> $GITHUB_OUTPUT

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-duration-seconds: 3600
+
    - name: Upload artifact
      if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }}
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -70,6 +70,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    # we create a table that has one row for each database that we want to restore with the status whether the restore is done
    - name: Create benchmark_restore_status table if it does not exist
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -31,12 +31,13 @@ defaults:
 env:
  RUST_BACKTRACE: 1
  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
  build-neon:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      contents: read
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
@@ -205,6 +206,13 @@ jobs:
            done
          fi

+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
      - name: Run rust tests
        env:
          NEXTEST_RETRIES: 3
@@ -256,6 +264,7 @@ jobs:
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
          path: /tmp/neon
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
@@ -265,6 +274,10 @@ jobs:
  regress-tests:
    # Don't run regression tests on debug arm64 builds
    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      contents: read
+      statuses: write
    needs: [ build-neon ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
    container:
@@ -283,7 +296,7 @@ jobs:
          submodules: true

      - name: Pytest regression tests
-        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
+        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
@@ -295,6 +308,7 @@ jobs:
          real_s3_region: eu-central-1
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -105,6 +105,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      id: create-neon-project
@@ -204,6 +205,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Run Logical Replication benchmarks
      uses: ./.github/actions/run-python-test-set
@@ -405,6 +407,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
@@ -708,6 +711,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -818,6 +822,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Get Connstring Secret Name
      run: |
@@ -926,6 +931,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,8 +21,6 @@ concurrency:
 env:
  RUST_BACKTRACE: 1
  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

@@ -255,15 +253,17 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      # run without LFC on v17 release only
+      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
+      # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the
+      # debug build with LFC enabled doesn't block merging.
      test-cfg: |
-        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v15", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v16", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "with-lfc"}]'
-                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
+        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v15", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v16", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "without-lfc"}]'
+                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc" }]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -360,6 +360,11 @@ jobs:
  create-test-report:
    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}

@@ -380,6 +385,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate
        with:
          store-test-results-into-db: true
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

@@ -411,6 +417,10 @@ jobs:
  coverage-report:
    if: ${{ !startsWith(github.ref_name, 'release') }}
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -437,12 +447,14 @@ jobs:
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
          path: /tmp/neon
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Get coverage artifact
        uses: ./.github/actions/download
        with:
          name: coverage-data-artifact
          path: /tmp/coverage
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Merge coverage data
        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
@@ -573,6 +585,10 @@ jobs:
  neon-image:
    needs: [ neon-image-arch, tag ]
    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read

    steps:
      - uses: docker/login-action@v3
@@ -587,11 +603,15 @@ jobs:
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Push multi-arch image to ECR
        run: |
@@ -600,6 +620,10 @@ jobs:

  compute-node-image-arch:
    needs: [ check-permissions, build-build-tools-image, tag ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    strategy:
      fail-fast: false
      matrix:
@@ -640,11 +664,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - uses: docker/login-action@v3
        with:
@@ -717,6 +745,10 @@ jobs:

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    runs-on: ubuntu-22.04

    strategy:
@@ -761,11 +793,15 @@ jobs:
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
        run: |
@@ -795,7 +831,7 @@ jobs:
          - pg: v17
            debian: bookworm
    env:
-      VM_BUILDER_VERSION: v0.35.0
+      VM_BUILDER_VERSION: v0.37.1

    steps:
      - uses: actions/checkout@v4
@@ -890,7 +926,9 @@ jobs:
    runs-on: ubuntu-22.04

    permissions:
-      id-token: write # for `aws-actions/configure-aws-credentials`
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read

    env:
      VERSIONS: v14 v15 v16 v17
@@ -901,12 +939,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Login to dev ECR
-        uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Copy vm-compute-node images to ECR
        run: |
@@ -1060,12 +1101,79 @@ jobs:
    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
-
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    steps:
      - uses: actions/checkout@v4

+      - name: Create git tag and GitHub release
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+        uses: actions/github-script@v7
+        with:
+          retries: 5 # retry the script on 5XX server errors: https://github.com/actions/github-script#retries
+          script: |
+            const tag = "${{ needs.tag.outputs.build-tag }}";
+
+            try {
+              const existingRef = await github.rest.git.getRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `tags/${tag}`,
+              });
+
+              if (existingRef.data.object.sha !== context.sha) {
+                throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
+              }
+
+              console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Tag ${tag} does not exist. Creating it...`);
+              await github.rest.git.createRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `refs/tags/${tag}`,
+                sha: context.sha,
+              });
+              console.log(`Tag ${tag} created successfully.`);
+            }
+
+            // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
+            if (context.ref !== 'refs/heads/release') {
+              console.log(`GitHub release skipped for ${context.ref}.`);
+              return;
+            }
+
+            try {
+              const existingRelease = await github.rest.repos.getReleaseByTag({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag: tag,
+              });
+
+              console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Release for tag ${tag} does not exist. Creating it...`);
+              await github.rest.repos.createRelease({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag_name: tag,
+                generate_release_notes: true,
+              });
+              console.log(`Release for tag ${tag} created successfully.`);
+            }
+
      - name: Trigger deploy workflow
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1115,38 +1223,13 @@ jobs:
            exit 1
          fi

-      - name: Create git tag
-        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.git.createRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
-              sha: context.sha,
-            })
-
-      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
-      - name: Create GitHub release
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.repos.createRelease({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              tag_name: "${{ needs.tag.outputs.build-tag }}",
-              generate_release_notes: true,
-            })
-
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: github.ref_name == 'release' && !failure() && !cancelled()

--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -19,14 +19,15 @@ concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+
 jobs:
  regress:
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
    strategy:
      fail-fast: false
      matrix:
@@ -78,6 +79,7 @@ jobs:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create a new branch
        id: create-branch
@@ -107,12 +109,14 @@ jobs:
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
+        with:
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
-          channel-id: "C033QLM5P7D" # on-call-staging-stream
+          channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
          slack-message: |
            Periodic pg_regress on staging: ${{ job.status }}
            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -64,6 +64,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      if: ${{ matrix.target_project == 'new_empty_project' }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -143,6 +143,10 @@ jobs:

  gather-rust-build-stats:
    needs: [ check-permissions, build-build-tools-image ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -177,13 +181,18 @@ jobs:
      - name: Produce the build stats
        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)

+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
      - name: Upload the build stats
        id: upload-stats
        env:
          BUCKET: neon-github-public-dev
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html
          aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/"
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -21,6 +21,9 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+
 concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false
@@ -38,8 +41,6 @@ jobs:
    env:
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION : "eu-central-1"
      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
    steps:
@@ -50,6 +51,13 @@ jobs:
    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
      run: curl https://ifconfig.me

+    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
+        role-duration-seconds: 3600
+
    - name: Start EC2 instance and wait for the instance to boot up
      run: |
        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
@@ -124,11 +132,10 @@ jobs:
        cat "test_log_${GITHUB_RUN_ID}"

    - name: Create Allure report
-      env:
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -148,6 +155,14 @@ jobs:
        -H "Authorization: Bearer $API_KEY" \
        -d ''

+    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
+      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
+        role-duration-seconds: 3600
+
    - name: Stop EC2 instance and wait for the instance to be stopped
      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
      run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -25,11 +25,13 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+  statuses: write # require for posting a status update
+
 env:
  DEFAULT_PG_VERSION: 16
  PLATFORM: neon-captest-new
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  AWS_DEFAULT_REGION: eu-central-1

 jobs:
@@ -94,6 +96,7 @@ jobs:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Neon Project
        id: create-neon-project
@@ -126,6 +129,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate
        with:
          store-test-results-into-db: true
+          aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

@@ -159,6 +163,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      id: create-neon-project
@@ -191,6 +196,7 @@ jobs:
      uses: ./.github/actions/allure-report-generate
      with:
        store-test-results-into-db: true
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -67,7 +67,7 @@ jobs:
    runs-on: ubuntu-22.04

    permissions:
-      id-token: write # for `azure/login`
+      id-token: write # for `azure/login` and aws auth

    steps:
      - uses: docker/login-action@v3
@@ -75,11 +75,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Azure login
        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -63,6 +63,7 @@ jobs:
    if: always()
    permissions:
      statuses: write # for `github.repos.createCommitStatus(...)`
+      contents: write
    needs:
      - get-changed-files
      - check-codestyle-python
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,7 +3,7 @@ name: Create Release Branch
 on:
  schedule:
    # It should be kept in sync with if-condition in jobs
-    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * FRI' # Storage release
    - cron: '0 6 * * THU' # Proxy release
  workflow_dispatch:
    inputs:
@@ -29,7 +29,7 @@ defaults:

 jobs:
  create-storage-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}
+    if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}

    permissions:
      contents: write
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5062,6 +5062,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "rand 0.8.5",
+ "reqwest",
 "scopeguard",
 "serde",
 "serde_json",
--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -19,3 +19,10 @@ max_prepared_statements=0
 admin_users=postgres
 unix_socket_dir=/tmp/
 unix_socket_mode=0777
+
+;; Disable connection logging. It produces a lot of logs that no one looks at,
+;; and we can get similar log entries from the proxy too. We had incidents in
+;; the past where the logging significantly stressed the log device or pgbouncer
+;; itself.
+log_connections=0
+log_disconnections=0
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -246,47 +246,48 @@ fn try_spec_from_cli(
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    let spec;
-    let mut live_config_allowed = false;
-    match spec_json {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => {
-            info!("got spec from cli argument {}", json);
-            spec = Some(serde_json::from_str(json)?);
-        }
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                spec = Some(serde_json::from_reader(file)?);
-                live_config_allowed = true;
-            } else if let Some(id) = compute_id {
-                if let Some(cp_base) = control_plane_uri {
-                    live_config_allowed = true;
-                    spec = match get_spec_from_control_plane(cp_base, id) {
-                        Ok(s) => s,
-                        Err(e) => {
-                            error!("cannot get response from control plane: {}", e);
-                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
-                        }
-                    };
-                } else {
-                    panic!("must specify both --control-plane-uri and --compute-id or none");
-                }
-            } else {
-                panic!(
-                    "compute spec should be provided by one of the following ways: \
-                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
-                );
-            }
-        }
+    // First, try to get cluster spec from the cli argument
+    if let Some(spec_json) = spec_json {
+        info!("got spec from cli argument {}", spec_json);
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_str(spec_json)?),
+            live_config_allowed: false,
+        });
+    }
+
+    // Second, try to read it from the file if path is provided
+    if let Some(spec_path) = spec_path {
+        let file = File::open(Path::new(spec_path))?;
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_reader(file)?),
+            live_config_allowed: true,
+        });
+    }
+
+    let Some(compute_id) = compute_id else {
+        panic!(
+            "compute spec should be provided by one of the following ways: \
+                --spec OR --spec-path OR --control-plane-uri and --compute-id"
+        );
+    };
+    let Some(control_plane_uri) = control_plane_uri else {
+        panic!("must specify both --control-plane-uri and --compute-id or none");
    };

-    Ok(CliSpecParams {
-        spec,
-        live_config_allowed,
-    })
+    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+        Ok(spec) => Ok(CliSpecParams {
+            spec,
+            live_config_allowed: true,
+        }),
+        Err(e) => {
+            error!(
+                "cannot get response from control plane: {}\n\
+                neither spec nor confirmation that compute is in the Empty state was received",
+                e
+            );
+            Err(e)
+        }
+    }
 }

 struct CliSpecParams {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -537,12 +537,13 @@ components:
            properties:
              extname:
                type: string
-              versions:
-                type: array
-                items:
-                  type: string
+              version:
+                type: string
              n_databases:
                type: integer
+              owned_by_superuser:
+                type: string
+                description: '"1" if the extension owner OID is 10 (superuser), "0" otherwise.'

    SetRoleGrantsRequest:
      type: object
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,7 +1,6 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
-use std::collections::HashSet;

 use anyhow::Result;
 use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Connect to every database (see list_dbs above) and get the list of installed extensions.
 ///
 /// Same extension can be installed in multiple databases with different versions,
-/// we only keep the highest and lowest version across all databases.
+/// so we report a separate metric (number of databases where it is installed)
+/// for each extension version.
 pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
    conf.application_name("compute_ctl:get_installed_extensions");
    let mut client = conf.connect(NoTls)?;
-
    let databases: Vec<String> = list_dbs(&mut client)?;

-    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
    for db in databases.iter() {
        conf.dbname(db);
        let mut db_client = conf.connect(NoTls)?;
-        let extensions: Vec<(String, String)> = db_client
+        let extensions: Vec<(String, String, i32)> = db_client
            .query(
-                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
                &[],
            )?
            .iter()
-            .map(|row| (row.get("extname"), row.get("extversion")))
+            .map(|row| {
+                (
+                    row.get("extname"),
+                    row.get("extversion"),
+                    row.get("extowner"),
+                )
+            })
            .collect();

-        for (extname, v) in extensions.iter() {
+        for (extname, v, extowner) in extensions.iter() {
            let version = v.to_string();

-            // increment the number of databases where the version of extension is installed
-            INSTALLED_EXTENSIONS
-                .with_label_values(&[extname, &version])
-                .inc();
+            // check if the extension is owned by superuser
+            // 10 is the oid of superuser
+            let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };

            extensions_map
-                .entry(extname.to_string())
+                .entry((
+                    extname.to_string(),
+                    version.clone(),
+                    owned_by_superuser.to_string(),
+                ))
                .and_modify(|e| {
-                    e.versions.insert(version.clone());
                    // count the number of databases where the extension is installed
                    e.n_databases += 1;
                })
                .or_insert(InstalledExtension {
                    extname: extname.to_string(),
-                    versions: HashSet::from([version.clone()]),
+                    version: version.clone(),
                    n_databases: 1,
+                    owned_by_superuser: owned_by_superuser.to_string(),
                });
        }
    }

-    let res = InstalledExtensions {
-        extensions: extensions_map.into_values().collect(),
-    };
+    INSTALLED_EXTENSIONS.reset(); // forget series of dropped/upgraded extensions
+    for ((extname, version, owned_by_superuser), ext) in extensions_map.iter() {
+        let n_databases = ext.n_databases as u64;

-    Ok(res)
+        INSTALLED_EXTENSIONS
+            .with_label_values(&[extname, version, owned_by_superuser])
+            .set(n_databases);
+    }
+
+    Ok(InstalledExtensions {
+        extensions: extensions_map.into_values().collect(),
+    })
 }

 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "compute_installed_extensions",
        "Number of databases where the version of extension is installed",
-        &["extension_name", "version"]
+        &["extension_name", "version", "owned_by_superuser"]
    )
    .expect("failed to define a metric")
 });
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -274,6 +274,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
        "HOME",
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -810,7 +810,7 @@ impl Endpoint {
        }

        let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(30))
+            .timeout(Duration::from_secs(120))
            .build()
            .unwrap();
        let response = client
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -435,7 +435,7 @@ impl PageServerNode {
    ) -> anyhow::Result<()> {
        let config = Self::parse_config(settings)?;
        self.http_client
-            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
            .await?;

        Ok(())
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -9,8 +9,8 @@ use pageserver_api::{
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
-        TenantShardSplitResponse,
+        ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -116,9 +116,19 @@ enum Command {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// Set the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
+    /// Any previous tenant configs are overwritten.
+    SetTenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
+    /// provided JSON are unset from the tenant config and all fields with non-null values are set.
+    /// Unspecified fields are not changed.
+    PatchTenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TenantConfig { tenant_id, config } => {
+        Command::SetTenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::PatchTenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .patch_tenant_config(&TenantConfigPatchRequest {
                    tenant_id,
                    config: tenant_conf,
                })
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
            threshold,
        } => {
            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: TenantConfig {
                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,6 +1,5 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

-use std::collections::HashSet;
 use std::fmt::Display;

 use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct InstalledExtension {
    pub extname: String,
-    pub versions: HashSet<String>,
+    pub version: String,
    pub n_databases: u32, // Number of databases using this extension
+    pub owned_by_superuser: String,
 }

 #[derive(Clone, Debug, Default, Serialize)]
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
    pub scheduling: Option<ShardSchedulingPolicy>,
 }

-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
 pub struct AvailabilityZone(pub String);

 impl Display for AvailabilityZone {
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
 pub struct CompactKey(i128);

 /// The storage key size.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -17,7 +17,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 use utils::{
    completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
    }
 }

+#[derive(Debug, Default, Clone, Eq, PartialEq)]
+pub enum FieldPatch<T> {
+    Upsert(T),
+    Remove,
+    #[default]
+    Noop,
+}
+
+impl<T> FieldPatch<T> {
+    fn is_noop(&self) -> bool {
+        matches!(self, FieldPatch::Noop)
+    }
+
+    pub fn apply(self, target: &mut Option<T>) {
+        match self {
+            Self::Upsert(v) => *target = Some(v),
+            Self::Remove => *target = None,
+            Self::Noop => {}
+        }
+    }
+
+    pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
+        match self {
+            Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
+            Self::Remove => Ok(FieldPatch::<U>::Remove),
+            Self::Noop => Ok(FieldPatch::<U>::Noop),
+        }
+    }
+}
+
+impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        Option::deserialize(deserializer).map(|opt| match opt {
+            None => FieldPatch::Remove, // an explicit null asks to unset the field
+            Some(val) => FieldPatch::Upsert(val), // any other value overwrites the field
+        })
+    } // `Noop` is never produced here; it only arises from `FieldPatch::default()`
+}
+
+impl<T: Serialize> Serialize for FieldPatch<T> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        match self {
+            FieldPatch::Upsert(val) => serializer.serialize_some(val),
+            FieldPatch::Remove => serializer.serialize_none(), // emitted as an explicit null
+            FieldPatch::Noop => Err(serde::ser::Error::custom("cannot serialize Noop patch")),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
+#[serde(default)]
+pub struct TenantConfigPatch {
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_distance: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_target_size: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_threshold: FieldPatch<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_horizon: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub pitr_interval: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub walreceiver_connect_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lagging_wal_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub eviction_policy: FieldPatch<EvictionPolicy>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub min_resident_size_override: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub heatmap_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lazy_slru_download: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_creation_check_threshold: FieldPatch<u8>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length_for_ts: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_offloading: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
 }

+impl TenantConfig {
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch.compaction_period.apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch.gc_period.apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch.pitr_interval.apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .apply(&mut walreceiver_connect_timeout);
+        patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch.heatmap_period.apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch.lsn_lease_length.apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        }
+    }
+}
+
 /// The policy for the aux file storage.
 ///
 /// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
    }
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantConfigPatchRequest {
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
            );
        }
    }
+
+    #[test]
+    fn test_tenant_config_patch_request_serde() {
+        let patch_request = TenantConfigPatchRequest {
+            tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
+            config: TenantConfigPatch {
+                checkpoint_distance: FieldPatch::Upsert(42),
+                gc_horizon: FieldPatch::Remove,
+                compaction_threshold: FieldPatch::Noop,
+                ..TenantConfigPatch::default()
+            },
+        };
+
+        let json = serde_json::to_string(&patch_request).unwrap();
+
+        let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
+        assert_eq!(json, expected);
+
+        let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
+        assert_eq!(decoded.tenant_id, patch_request.tenant_id);
+        assert_eq!(decoded.config, patch_request.config);
+
+        // Now apply the patch to a config to demonstrate semantics
+
+        let base = TenantConfig {
+            checkpoint_distance: Some(28),
+            gc_horizon: Some(100),
+            compaction_target_size: Some(1024),
+            ..Default::default()
+        };
+
+        let expected = TenantConfig {
+            checkpoint_distance: Some(42),
+            gc_horizon: None,
+            ..base.clone()
+        };
+
+        let patched = base.apply_patch(decoded.config);
+
+        assert_eq!(patched, expected);
+    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] }
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["client"] }
 futures.workspace = true
+reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,6 +8,7 @@ use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::SystemTime;

@@ -15,7 +16,7 @@ use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Context;
 use anyhow::Result;
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
-use azure_core::{Continuable, RetryOptions};
+use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
@@ -81,8 +82,9 @@ impl AzureBlobStorage {
        };

        // we have an outer retry
-        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
-
+        let builder = ClientBuilder::new(account, credentials)
+            .retry(RetryOptions::none())
+            .transport(TransportOptions::new(reqwest_client(small_timeout)));
        let client = builder.container_client(azure_config.container_name.to_owned());

        let max_keys_per_list_response =
@@ -261,6 +263,16 @@ impl AzureBlobStorage {
    }
 }

+fn reqwest_client(timeout: Duration) -> Arc<dyn HttpClient> {
+    let client = reqwest::ClientBuilder::new()
+        .pool_max_idle_per_host(0) // keep no idle connections in the pool
+        .read_timeout(timeout) // the same `timeout` bounds reads ...
+        .connect_timeout(timeout) // ... and connection establishment
+        .build()
+        .expect("failed to build `reqwest` client");
+    Arc::new(client) // type-erased so it can be handed to `TransportOptions::new`
+}
+
 fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
    let mut res = Metadata::new();
    for (k, v) in metadata.0.into_iter() {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -94,6 +94,8 @@ pub mod toml_edit_ext;

 pub mod circuit_breaker;

+pub mod try_rcu;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/try_rcu.rs
+++ b/libs/utils/src/try_rcu.rs
@@ -0,0 +1,77 @@
+//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
+
+pub trait ArcSwapExt<T> {
+    /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
+    fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>;
+}
+
+impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
+where
+    T: arc_swap::RefCnt,
+    S: arc_swap::strategy::CaS<T>,
+{
+    fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>,
+    {
+        fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
+        where
+            A: arc_swap::AsRaw<Base>,
+            B: arc_swap::AsRaw<Base>,
+        {
+            let a = a.as_raw();
+            let b = b.as_raw();
+            std::ptr::eq(a, b) // identity comparison of the raw pointers, not of the values
+        }
+
+        let mut cur = self.load();
+        loop {
+            let new = f(&cur)?.into(); // an `Err` from `f` returns early, before any swap attempt
+            let prev = self.compare_and_swap(&*cur, new);
+            let swapped = ptr_eq(&*cur, &*prev); // previous value is the one we read => swap won
+            if swapped {
+                return Ok(arc_swap::Guard::into_inner(prev)); // hand back the replaced value
+            } else {
+                cur = prev; // value changed underneath us: rerun `f` against what is stored now
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arc_swap::ArcSwap;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_try_rcu_success() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
+
+        assert!(result.is_ok());
+        assert_eq!(**swap.load(), 43);
+    }
+
+    #[test]
+    fn test_try_rcu_error() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<i32, _> {
+            if **value == 42 {
+                Err("err")
+            } else {
+                Ok(**value + 1)
+            }
+        });
+
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "err");
+        assert_eq!(**swap.load(), 42);
+    }
+}
--- a/libs/wal_decoder/proto/interpreted_wal.proto
+++ b/libs/wal_decoder/proto/interpreted_wal.proto
@@ -37,7 +37,7 @@ message ValueMeta {
 }

 message CompactKey {
-  int64 high = 1;
-  int64 low = 2;
+  uint64 high = 1; // raw key value shifted right by 64 bits
+  uint64 low = 2; // low 64 bits of the raw key value
 }

--- a/libs/wal_decoder/src/wire_format.rs
+++ b/libs/wal_decoder/src/wire_format.rs
@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
 impl From<CompactKey> for proto::CompactKey {
    fn from(value: CompactKey) -> Self {
        proto::CompactKey {
-            high: (value.raw() >> 64) as i64,
-            low: value.raw() as i64,
+            high: (value.raw() >> 64) as u64, // bits 64 and above of the raw key
+            low: value.raw() as u64, // the cast truncates to the low 64 bits
        }
    }
 }
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
        (((value.high as i128) << 64) | (value.low as i128)).into()
    }
 }
+
+#[test]
+fn test_compact_key_with_large_relnode() {
+    use pageserver_api::key::Key;
+
+    let inputs = vec![
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0, // baseline: smallest field4
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x007FFFFF, // largest value with bit 23 still clear
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800000, // smallest value with bit 23 set
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800001, // one past the bit-23 boundary
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0xFFFFFFFF,
+            field3: 0xFFFFFFFF,
+            field4: 0xFFFFFFFF, // all 32 bits set in field2..field4
+            field5: 0x0,
+            field6: 0x0,
+        },
+    ];
+
+    for input in inputs {
+        assert!(input.is_valid_key_on_write_path());
+        let compact = input.to_compact();
+        let proto: proto::CompactKey = compact.into(); // CompactKey -> wire representation
+        let from_proto: CompactKey = proto.into(); // wire representation -> CompactKey
+
+        assert_eq!(
+            compact, from_proto,
+            "Round trip failed for key with relnode={:#x}",
+            input.field4
+        );
+    }
+}
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;

+    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-lib=static=pgport");
    println!("cargo:rustc-link-lib=static=pgcommon");
-    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-search={walproposer_lib_search_str}");

    // Rebuild crate when libwalproposer.a changes
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -270,12 +270,18 @@ impl Client {
        Ok(body)
    }

-    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
        Ok(())
    }

+    pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); // same path as the PUT variant
+        self.request(Method::PATCH, &uri, req).await?; // the response is dropped; only success/failure is returned
+        Ok(())
+    }
+
    pub async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    println!("operating on timeline {}", timeline);

    mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
+        .set_tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
            config: TenantConfig::default(),
        })
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -767,7 +767,27 @@ paths:
  /v1/tenant/config:
    put:
      description: |
-        Update tenant's config.
+        Update tenant's config by setting it to the provided value.
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantConfigRequest"
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TenantInfo"
+    patch:
+      description: |
+        Update tenant's config additively by patching the updated fields provided.
+        Null values unset the field and non-null values upsert it.

        Invalid fields in the tenant config will cause the request to be rejected with status 400.
      requestBody:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
+use pageserver_api::models::TenantConfigPatchRequest;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
@@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler(
    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+
+    let _ = tenant
+        .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
+        .expect("Closure returns Ok()");
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn patch_tenant_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?; // checked against the tenant id taken from the request body
+
+    let state = get_state(&request);
+
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id); // this endpoint always uses the unsharded shard id
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    let updated = tenant
+        .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
+        .map_err(ApiError::BadRequest)?; // a patch that fails to apply is returned as ApiError::BadRequest
+
+    // This is a legacy API that only operates on attached tenants: the preferred
+    // API to use is the location_config/ endpoint, which lets the caller provide
+    // the full LocationConf.
+    let location_conf = LocationConf::attached_single(
+        updated,
+        tenant.get_generation(),
+        &ShardParameters::default(),
+    );
+
+    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+        .await
+        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; // NOTE(review): runs after the in-memory update; a failure here leaves memory and disk out of sync - confirm this is acceptable

    json_response(StatusCode::OK, ())
 }
@@ -2040,13 +2081,20 @@ async fn timeline_compact_handler(
        .as_ref()
        .map(|r| r.sub_compaction)
        .unwrap_or(false);
+    let sub_compaction_max_job_size_mb = compact_request
+        .as_ref()
+        .and_then(|r| r.sub_compaction_max_job_size_mb);
+
    let options = CompactOptions {
-        compact_range: compact_request
+        compact_key_range: compact_request
            .as_ref()
-            .and_then(|r| r.compact_range.clone()),
-        compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
+            .and_then(|r| r.compact_key_range.clone()),
+        compact_lsn_range: compact_request
+            .as_ref()
+            .and_then(|r| r.compact_lsn_range.clone()),
        flags,
        sub_compaction,
+        sub_compaction_max_job_size_mb,
    };

    let scheduled = compact_request
@@ -3288,6 +3336,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
+        .patch("/v1/tenant/config", |r| {
+            api_handler(r, patch_tenant_config_handler)
+        })
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
-use tracing::warn;
 use utils::id::TimelineId;

 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -1225,32 +1224,58 @@ pub(crate) mod virtual_file_io_engine {

 pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
 pub(crate) struct SmgrOpTimerInner {
-    global_latency_histo: Histogram,
+    global_execution_latency_histo: Histogram,
+    per_timeline_execution_latency_histo: Option<Histogram>,

-    // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<Histogram>,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,

    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,

-    start: Instant,
-    throttled: Duration,
-    op: SmgrQueryType,
+    timings: SmgrOpTimerState,
+}
+
+#[derive(Debug)]
+enum SmgrOpTimerState {
+    Received { // initial state, set by `start_smgr_op`
+        received_at: Instant,
+    },
+    ThrottleDoneExecutionStarting { // entered via `observe_throttle_done_execution_starting`
+        received_at: Instant, // batch wait is measured from here ...
+        throttle_started_at: Instant, // ... up to here; throttle time runs from here ...
+        started_execution_at: Instant, // ... up to here; execution latency starts here
+    },
 }

 pub(crate) struct SmgrOpFlushInProgress {
-    base: Instant,
+    flush_started_at: Instant,
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
 }

 impl SmgrOpTimer {
-    pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
-        let Some(throttle) = throttle else {
-            return;
-        };
+    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
        let inner = self.0.as_mut().expect("other public methods consume self");
-        inner.throttled += *throttle;
+        match (&mut inner.timings, throttle) { // only valid in the `Received` state; any other state panics below
+            (SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
+                ThrottleResult::NotThrottled { start } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at,
+                        throttle_started_at: *start,
+                        started_execution_at: *start, // not throttled: execution starts when the throttle check did
+                    };
+                }
+                ThrottleResult::Throttled { start, end } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at, // keep the real receive time; using `start` here would zero the batch wait
+                        throttle_started_at: *start,
+                        started_execution_at: *end, // throttled: execution starts once the throttle wait ended
+                    };
+                }
+            },
+            (x, _) => panic!("called in unexpected state: {x:?}"),
+        }
    }

    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
@@ -1263,7 +1288,7 @@ impl SmgrOpTimer {
            ..
        } = inner;
        SmgrOpFlushInProgress {
-            base: flush_start,
+            flush_started_at: flush_start,
            global_micros: global_flush_in_progress_micros,
            per_timeline_micros: per_timeline_flush_in_progress_micros,
        }
@@ -1274,32 +1299,42 @@ impl SmgrOpTimer {
        let inner = self.0.take()?;

        let now = Instant::now();
-        let elapsed = now - inner.start;

-        let elapsed = match elapsed.checked_sub(inner.throttled) {
-            Some(elapsed) => elapsed,
-            None => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[inner.op];
-                rate_limit.call(|| {
-                    warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
-                });
-                elapsed // un-throttled time, more info than just saturating to 0
+        let batch;
+        let execution;
+        let throttle;
+        match inner.timings {
+            SmgrOpTimerState::Received { received_at } => {
+                batch = (now - received_at).as_secs_f64();
+                // TODO: use label for dropped requests.
+                // This is quite rare in practice, only during tenant/pageservers shutdown.
+                throttle = Duration::ZERO;
+                execution = Duration::ZERO.as_secs_f64();
            }
-        };
+            SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                received_at,
+                throttle_started_at,
+                started_execution_at,
+            } => {
+                batch = (throttle_started_at - received_at).as_secs_f64();
+                throttle = started_execution_at - throttle_started_at;
+                execution = (now - started_execution_at).as_secs_f64();
+            }
+        }

-        let elapsed = elapsed.as_secs_f64();
+        // update time spent in batching
+        inner.global_batch_wait_time.observe(batch);
+        inner.per_timeline_batch_wait_time.observe(batch);

-        inner.global_latency_histo.observe(elapsed);
-        if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(elapsed);
+        // time spent in throttle metric is updated by throttle impl
+        let _ = throttle;
+
+        // update metrics for execution latency
+        inner.global_execution_latency_histo.observe(execution);
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution);
        }

        Some((now, inner))
@@ -1325,12 +1360,12 @@ impl SmgrOpFlushInProgress {
        // Last call is tracked in `now`.
        let mut observe_guard = scopeguard::guard(
            || {
-                let elapsed = now - self.base;
+                let elapsed = now - self.flush_started_at;
                self.global_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
                self.per_timeline_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.base = now;
+                self.flush_started_at = now;
            },
            |mut observe| {
                observe();
@@ -1377,6 +1412,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_batch_size: Histogram,
    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,
 }

 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1399,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
    .expect("failed to define a metric")
 });

+// Alias so all histograms recording per-timeline smgr timings use the same buckets.
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
+        "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
-        CRITICAL_OP_BUCKETS.into(),
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -1462,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds_global",
-        "Time spent on smgr query handling, aggregated by query type.",
+        "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
        &["smgr_query_type"],
        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
    )
@@ -1559,6 +1599,25 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy
    .expect("failed to define a metric")
 });

+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds",
+        "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
+        "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
+        SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
+    )
+    .expect("failed to define a metric")
+});
+
 impl SmgrQueryTimePerTimeline {
    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1599,6 +1658,11 @@ impl SmgrQueryTimePerTimeline {
            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
            .unwrap();

+        let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
+        let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
        let global_flush_in_progress_micros =
            PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
        let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -1614,9 +1678,11 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_batch_size,
            global_flush_in_progress_micros,
            per_timeline_flush_in_progress_micros,
+            global_batch_wait_time,
+            per_timeline_batch_wait_time,
        }
    }
-    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
+    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
        self.global_started[op as usize].inc();

        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1627,15 +1693,15 @@ impl SmgrQueryTimePerTimeline {
        };

        SmgrOpTimer(Some(SmgrOpTimerInner {
-            global_latency_histo: self.global_latency[op as usize].clone(),
-            per_timeline_latency_histo,
-            start: started_at,
-            op,
-            throttled: Duration::ZERO,
+            global_execution_latency_histo: self.global_latency[op as usize].clone(),
+            per_timeline_execution_latency_histo: per_timeline_latency_histo,
+            timings: SmgrOpTimerState::Received { received_at },
            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
            per_timeline_flush_in_progress_micros: self
                .per_timeline_flush_in_progress_micros
                .clone(),
+            global_batch_wait_time: self.global_batch_wait_time.clone(),
+            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
        }))
    }

@@ -2889,6 +2955,11 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
        ]);
+        let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
    }
 }

@@ -2919,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
 use crate::tenant::Timeline;

 /// Maintain a per timeline gauge in addition to the global gauge.
@@ -3773,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
        &CIRCUIT_BREAKERS_BROKEN,
        &CIRCUIT_BREAKERS_UNBROKEN,
+        &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
    ]
    .into_iter()
    .for_each(|c| {
@@ -3820,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &WAL_REDO_BYTES_HISTOGRAM,
        &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
        &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
+        &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
    ]
    .into_iter()
    .for_each(|h| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -575,7 +575,10 @@ enum BatchedFeMessage {
 }

 impl BatchedFeMessage {
-    async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
+    async fn throttle_and_record_start_processing(
+        &mut self,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
        let (shard, tokens, timers) = match self {
            BatchedFeMessage::Exists { shard, timer, .. }
            | BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
            }
        };
        for timer in timers {
-            timer.deduct_throttle(&throttled);
+            timer.observe_throttle_done_execution_starting(&throttled);
        }
        Ok(())
    }
@@ -1230,7 +1233,7 @@ impl PageServerHandler {
                }
            };

-            if let Err(cancelled) = msg.throttle(&self.cancel).await {
+            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
                break cancelled;
            }

@@ -1397,7 +1400,9 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    batch.throttle(&self.cancel).await?;
+                    batch
+                        .throttle_and_record_start_processing(&self.cancel)
+                        .await?;
                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
                        .await?;
                }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -44,6 +44,7 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::GcCompactJob;
 use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
@@ -68,6 +69,7 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::try_rcu::ArcSwapExt;
 use utils::zstd::create_zst_tarball;
 use utils::zstd::extract_zst_tarball;

@@ -3016,8 +3018,15 @@ impl Tenant {
                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
                        } else if next_scheduled_compaction_task.options.sub_compaction {
                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-                            let jobs = timeline
-                                .gc_compaction_split_jobs(next_scheduled_compaction_task.options)
+                            let jobs: Vec<GcCompactJob> = timeline
+                                .gc_compaction_split_jobs(
+                                    GcCompactJob::from_compact_options(
+                                        next_scheduled_compaction_task.options.clone(),
+                                    ),
+                                    next_scheduled_compaction_task
+                                        .options
+                                        .sub_compaction_max_job_size_mb,
+                                )
                                .await
                                .map_err(CompactionError::Other)?;
                            if jobs.is_empty() {
@@ -3028,9 +3037,23 @@ impl Tenant {
                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
                                for (idx, job) in jobs.into_iter().enumerate() {
+                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
+                                    // until we do further refactors to allow directly call `compact_with_gc`.
+                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                                    if job.dry_run {
+                                        flags |= CompactFlags::DryRun;
+                                    }
+                                    let options = CompactOptions {
+                                        flags,
+                                        sub_compaction: false,
+                                        compact_key_range: Some(job.compact_key_range.into()),
+                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
+                                        sub_compaction_max_job_size_mb: None,
+                                    };
                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
                                        ScheduledCompactionTask {
-                                            options: job,
+                                            options,
                                            // The last job in the queue sends the signal and releases the gc guard
                                            result_tx: next_scheduled_compaction_task
                                                .result_tx
@@ -3041,7 +3064,7 @@ impl Tenant {
                                        }
                                    } else {
                                        ScheduledCompactionTask {
-                                            options: job,
+                                            options,
                                            result_tx: None,
                                            gc_block: None,
                                        }
@@ -3921,25 +3944,28 @@ impl Tenant {
        }
    }

-    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
+    pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
+        &self,
+        update: F,
+    ) -> anyhow::Result<TenantConfOpt> {
        // Use read-copy-update in order to avoid overwriting the location config
        // state if this races with [`Tenant::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.

-        self.tenant_conf.rcu(|inner| {
-            Arc::new(AttachedTenantConf {
-                tenant_conf: new_tenant_conf.clone(),
-                location: inner.location,
-                // Attached location is not changed, no need to update lsn lease deadline.
-                lsn_lease_deadline: inner.lsn_lease_deadline,
-            })
-        });
+        self.tenant_conf
+            .try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
+                Ok(Arc::new(AttachedTenantConf {
+                    tenant_conf: update(attached_conf.tenant_conf.clone())?,
+                    location: attached_conf.location,
+                    lsn_lease_deadline: attached_conf.lsn_lease_deadline,
+                }))
+            })?;

-        let updated = self.tenant_conf.load().clone();
+        let updated = self.tenant_conf.load();

-        self.tenant_conf_updated(&new_tenant_conf);
+        self.tenant_conf_updated(&updated.tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
@@ -3947,6 +3973,8 @@ impl Tenant {
        for timeline in timelines {
            timeline.tenant_conf_updated(&updated);
        }
+
+        Ok(updated.tenant_conf.clone())
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
@@ -4506,7 +4534,12 @@ impl Tenant {
                // - this timeline was created while we were finding cutoffs
                // - lsn for timestamp search fails for this timeline repeatedly
                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
-                    target.cutoffs = cutoffs.clone();
+                    let original_cutoffs = target.cutoffs.clone();
+                    // GC cutoffs should never go back
+                    target.cutoffs = GcCutoffs {
+                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
+                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                    }
                }
            }

@@ -5731,6 +5764,8 @@ mod tests {
    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    #[cfg(feature = "testing")]
+    use timeline::CompactLsnRange;
+    #[cfg(feature = "testing")]
    use timeline::GcInfo;

    static TEST_KEY: Lazy<Key> =
@@ -9322,7 +9357,6 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: dryrun_flags,
-                    compact_range: None,
                    ..Default::default()
                },
                &ctx,
@@ -9571,7 +9605,6 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: dryrun_flags,
-                    compact_range: None,
                    ..Default::default()
                },
                &ctx,
@@ -9601,6 +9634,8 @@ mod tests {
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
+        use timeline::CompactLsnRange;
+
        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
        let (tenant, ctx) = harness.load().await;

@@ -9793,6 +9828,22 @@ mod tests {

        verify_result().await;

+        // Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files.
+        // Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18).
+        branch_tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+        verify_result().await;
+
        Ok(())
    }

@@ -10081,7 +10132,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_key_range: Some((get_key(0)..get_key(2)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10128,7 +10179,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(2)..get_key(4)).into()),
+                    compact_key_range: Some((get_key(2)..get_key(4)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10180,7 +10231,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(4)..get_key(9)).into()),
+                    compact_key_range: Some((get_key(4)..get_key(9)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10231,7 +10282,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(9)..get_key(10)).into()),
+                    compact_key_range: Some((get_key(9)..get_key(10)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10287,7 +10338,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(0)..get_key(10)).into()),
+                    compact_key_range: Some((get_key(0)..get_key(10)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10316,7 +10367,6 @@ mod tests {
                },
            ],
        );
-
        Ok(())
    }

@@ -10369,4 +10419,602 @@ mod tests {

        Ok(())
    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![(
+            get_key(1),
+            Lsn(0x20),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+        )];
+        let delta4 = vec![(
+            get_key(1),
+            Lsn(0x28),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+        )];
+        let delta2 = vec![
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x38),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    // delta1/2/4 only contain a single key but multiple updates
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
+                ],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_20 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_10 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
+            for idx in 0..10 {
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_20[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_10[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // Delta layer below the specified above_lsn not compacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x28),
+                    is_delta: true,
+                },
+                // Delta layer compacted above the LSN
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x28)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // compact again
+        tline
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The compacted image layer (full key range)
+                PersistentLayerKey {
+                    key_range: Key::MIN..Key::MAX,
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // All other data in the delta layer
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        Ok(())
+    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![(
+            get_key(1),
+            Lsn(0x20),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+        )];
+        let delta4 = vec![(
+            get_key(1),
+            Lsn(0x28),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+        )];
+        let delta2 = vec![
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x38),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    // delta1/2/4 only contain a single key but multiple updates
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
+                ],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_20 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_10 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
+            for idx in 0..10 {
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_20[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_10[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // According to the selection logic, we select all layers with start LSN <= 0x28, so we would merge the layer 0x20-0x28 and
+                // the layer 0x28-0x30 into one.
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x30),
+                    is_delta: true,
+                },
+                // Above the upper bound and untouched
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // This layer is untouched
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(3)..get_key(8)).into()),
+                    compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // Not in the compaction key range, uncompacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x30),
+                    is_delta: true,
+                },
+                // Not in the compaction key range, uncompacted, but needs a rewrite because the delta layer overlaps with the range
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer
+                // horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction
+                // becomes 0x50.
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // compact again
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(0)..get_key(5)).into()),
+                    compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // The range gets compacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // Not touched during this iteration of compaction
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // final full compaction
+        tline
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The compacted image layer (full key range)
+                PersistentLayerKey {
+                    key_range: Key::MIN..Key::MAX,
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // All other data in the delta layer
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
                .or(global_conf.wal_receiver_protocol_override),
        }
    }
+
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch
+            .checkpoint_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch
+            .compaction_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch
+            .gc_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch
+            .pitr_interval
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut walreceiver_connect_timeout);
+        patch
+            .lagging_wal_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch
+            .heatmap_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .lsn_lease_length
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Ok(Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        })
+    }
 }

 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -58,6 +58,11 @@ pub struct Stats {
    pub sum_throttled_usecs: u64,
 }

+pub enum ThrottleResult {
+    NotThrottled { start: Instant },
+    Throttled { start: Instant, end: Instant },
+}
+
 impl<M> Throttle<M>
 where
    M: Metric,
@@ -122,15 +127,15 @@ where
        self.inner.load().rate_limiter.steady_rps()
    }

-    pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
+    pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

-        if !inner.enabled {
-            return None;
-        }
-
        let start = std::time::Instant::now();

+        if !inner.enabled {
+            return ThrottleResult::NotThrottled { start };
+        }
+
        self.metric.accounting_start();
        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
            let observation = Observation { wait_time };
            self.metric.observe_throttling(&observation);
-            Some(wait_time)
+            ThrottleResult::Throttled { start, end: now }
        } else {
-            None
+            ThrottleResult::NotThrottled { start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -780,46 +780,90 @@ pub(crate) enum CompactFlags {
 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
 pub(crate) struct CompactRequest {
-    pub compact_range: Option<CompactRange>,
-    pub compact_below_lsn: Option<Lsn>,
+    pub compact_key_range: Option<CompactKeyRange>,
+    pub compact_lsn_range: Option<CompactLsnRange>,
    /// Whether the compaction job should be scheduled.
    #[serde(default)]
    pub scheduled: bool,
    /// Whether the compaction job should be split across key ranges.
    #[serde(default)]
    pub sub_compaction: bool,
+    /// Max job size for each subcompaction job.
+    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
-pub(crate) struct CompactRange {
+pub(crate) struct CompactLsnRange {
+    pub start: Lsn,
+    pub end: Lsn,
+}
+
+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactKeyRange {
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub start: Key,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub end: Key,
 }

-impl From<Range<Key>> for CompactRange {
-    fn from(range: Range<Key>) -> Self {
-        CompactRange {
+impl From<Range<Lsn>> for CompactLsnRange {
+    fn from(range: Range<Lsn>) -> Self {
+        Self {
            start: range.start,
            end: range.end,
        }
    }
 }

+impl From<Range<Key>> for CompactKeyRange {
+    fn from(range: Range<Key>) -> Self {
+        Self {
+            start: range.start,
+            end: range.end,
+        }
+    }
+}
+
+impl From<CompactLsnRange> for Range<Lsn> {
+    fn from(range: CompactLsnRange) -> Self {
+        range.start..range.end
+    }
+}
+
+impl From<CompactKeyRange> for Range<Key> {
+    fn from(range: CompactKeyRange) -> Self {
+        range.start..range.end
+    }
+}
+
+impl CompactLsnRange {
+    #[cfg(test)]
+    #[cfg(feature = "testing")]
+    pub fn above(lsn: Lsn) -> Self {
+        Self {
+            start: lsn,
+            end: Lsn::MAX,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
    /// If set, the compaction will only compact the key range specified by this option.
-    /// This option is only used by GC compaction.
-    pub compact_range: Option<CompactRange>,
-    /// If set, the compaction will only compact the LSN below this value.
-    /// This option is only used by GC compaction.
-    pub compact_below_lsn: Option<Lsn>,
+    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
+    pub compact_key_range: Option<CompactKeyRange>,
+    /// If set, the compaction will only compact within this LSN range.
+    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
+    pub compact_lsn_range: Option<CompactLsnRange>,
    /// Enable sub-compaction (split compaction job across key ranges).
    /// This option is only used by GC compaction.
    pub sub_compaction: bool,
+    /// Set job size for the GC compaction.
+    /// This option is only used by GC compaction.
+    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

 impl std::fmt::Debug for Timeline {
@@ -1641,9 +1685,10 @@ impl Timeline {
            cancel,
            CompactOptions {
                flags,
-                compact_range: None,
-                compact_below_lsn: None,
+                compact_key_range: None,
+                compact_lsn_range: None,
                sub_compaction: false,
+                sub_compaction_max_job_size_mb: None,
            },
            ctx,
        )
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,8 +10,8 @@ use std::sync::Arc;

 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
-    ImageLayerCreationMode, RecordedDuration, Timeline,
+    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
 };

 use anyhow::{anyhow, bail, Context};
@@ -64,6 +64,9 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;

 /// A scheduled compaction task.
 pub(crate) struct ScheduledCompactionTask {
+    /// It's unfortunate that we need to store a compact options struct here because the only outer
+    /// API we can call here is `compact_with_options` which does a few setup calls before starting the
+    /// actual compaction job... We should refactor this to store `GcCompactJob` in the future.
    pub options: CompactOptions,
    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
@@ -71,16 +74,57 @@ pub(crate) struct ScheduledCompactionTask {
    pub gc_block: Option<gc_block::Guard>,
 }

+/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
+/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets
+/// called.
+#[derive(Debug, Clone)]
+pub(crate) struct GcCompactJob {
+    pub dry_run: bool,
+    /// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range
+    /// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary.
+    pub compact_key_range: Range<Key>,
+    /// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be
+    /// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range
+    /// as specified here. The true range being compacted is `min_layer_lsn/max_layer_lsn` in [`GcCompactionJobDescription`].
+    /// `min_layer_lsn` will always be <= the lower bound specified here, and `max_layer_lsn` will always be >= the upper bound specified here.
+    pub compact_lsn_range: Range<Lsn>,
+}
+
+impl GcCompactJob {
+    /// Builds a job from `CompactOptions`; an unset range falls back to the widest one.
+    pub fn from_compact_options(options: CompactOptions) -> Self {
+        let dry_run = options.flags.contains(CompactFlags::DryRun);
+        let key_range = options.compact_key_range;
+        let lsn_range = options.compact_lsn_range;
+        Self {
+            dry_run,
+            // No key range requested: use `Key::MIN..Key::MAX`.
+            compact_key_range: key_range.map_or(Key::MIN..Key::MAX, Into::into),
+            // No LSN range requested: use `Lsn::INVALID..Lsn::MAX`.
+            compact_lsn_range: lsn_range.map_or(Lsn::INVALID..Lsn::MAX, Into::into),
+        }
+    }
+}
+
+/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called
+/// and contains the exact layers we want to compact.
 pub struct GcCompactionJobDescription {
    /// All layers to read in the compaction job
    selected_layers: Vec<Layer>,
-    /// GC cutoff of the job
+    /// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to
+    /// keep all deltas <= this LSN or generate an image == this LSN.
    gc_cutoff: Lsn,
-    /// LSNs to retain for the job
+    /// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or
+    /// generate an image == this LSN.
    retain_lsns_below_horizon: Vec<Lsn>,
-    /// Maximum layer LSN processed in this compaction
+    /// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data
+    /// \>= this LSN will be kept and will not be rewritten.
    max_layer_lsn: Lsn,
-    /// Only compact layers overlapping with this range
+    /// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive.
+    /// All accesses strictly below (`<`) this LSN will be routed through the normal read path instead of
+    /// k-merge within gc-compaction.
+    min_layer_lsn: Lsn,
+    /// Only compact layers overlapping with this range.
    compaction_key_range: Range<Key>,
    /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
    /// This field is here solely for debugging. The field will not be read once the compaction
@@ -299,7 +343,7 @@ impl Timeline {
            )));
        }

-        if options.compact_range.is_some() {
+        if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() {
            // maybe useful in the future? could implement this at some point
            return Err(CompactionError::Other(anyhow!(
                "compaction range is not supported for legacy compaction for now"
@@ -1754,25 +1798,26 @@ impl Timeline {
        Ok(())
    }

-    /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
-    /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
-    /// ad-hoc information about gc compaction itself.
+    /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
+    /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
+    /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset act
+    /// like a full compaction of the specified keyspace.
    pub(crate) async fn gc_compaction_split_jobs(
        self: &Arc<Self>,
-        options: CompactOptions,
-    ) -> anyhow::Result<Vec<CompactOptions>> {
-        if !options.sub_compaction {
-            return Ok(vec![options]);
-        }
-        let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
-            start: Key::MIN,
-            end: Key::MAX,
-        });
-        let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
-            compact_below_lsn
+        job: GcCompactJob,
+        sub_compaction_max_job_size_mb: Option<u64>,
+    ) -> anyhow::Result<Vec<GcCompactJob>> {
+        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
+            job.compact_lsn_range.end
        } else {
            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
        };
+
+        // Split compaction job to about 4GB each
+        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
+        let sub_compaction_max_job_size_mb =
+            sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
+
        let mut compact_jobs = Vec::new();
        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
@@ -1808,8 +1853,8 @@ impl Timeline {
            let Some((start, end)) = truncate_to(
                &range.start,
                &range.end,
-                &compact_range.start,
-                &compact_range.end,
+                &job.compact_key_range.start,
+                &job.compact_key_range.end,
            ) else {
                continue;
            };
@@ -1819,8 +1864,6 @@ impl Timeline {
        let guard = self.layers.read().await;
        let layer_map = guard.layer_map()?;
        let mut current_start = None;
-        // Split compaction job to about 2GB each
-        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
        let ranges_num = split_key_ranges.len();
        for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
            if current_start.is_none() {
@@ -1833,8 +1876,7 @@ impl Timeline {
            }
            let res = layer_map.range_search(start..end, compact_below_lsn);
            let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
-            if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
-                let mut compact_options = options.clone();
+            if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
                // Try to extend the compaction range so that we include at least one full layer file.
                let extended_end = res
                    .found
@@ -1852,10 +1894,11 @@ impl Timeline {
                    "splitting compaction job: {}..{}, estimated_size={}",
                    start, end, total_size
                );
-                compact_options.compact_range = Some(CompactRange { start, end });
-                compact_options.compact_below_lsn = Some(compact_below_lsn);
-                compact_options.sub_compaction = false;
-                compact_jobs.push(compact_options);
+                compact_jobs.push(GcCompactJob {
+                    dry_run: job.dry_run,
+                    compact_key_range: start..end,
+                    compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
+                });
                current_start = Some(end);
            }
        }
@@ -1877,7 +1920,7 @@ impl Timeline {
    /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
    /// part of the range.
    ///
-    /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
+    /// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with
    /// the LSN. Otherwise, it will use the gc cutoff by default.
    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
@@ -1885,9 +1928,13 @@ impl Timeline {
        options: CompactOptions,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if options.sub_compaction {
+        let sub_compaction = options.sub_compaction;
+        let job = GcCompactJob::from_compact_options(options.clone());
+        if sub_compaction {
            info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-            let jobs = self.gc_compaction_split_jobs(options).await?;
+            let jobs = self
+                .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb)
+                .await?;
            let jobs_len = jobs.len();
            for (idx, job) in jobs.into_iter().enumerate() {
                info!(
@@ -1902,19 +1949,15 @@ impl Timeline {
            }
            return Ok(());
        }
-        self.compact_with_gc_inner(cancel, options, ctx).await
+        self.compact_with_gc_inner(cancel, job, ctx).await
    }

    async fn compact_with_gc_inner(
        self: &Arc<Self>,
        cancel: &CancellationToken,
-        options: CompactOptions,
+        job: GcCompactJob,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        assert!(
-            !options.sub_compaction,
-            "sub-compaction should be handled by the outer function"
-        );
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1934,19 +1977,11 @@ impl Timeline {
        )
        .await?;

-        let flags = options.flags;
-        let compaction_key_range = options
-            .compact_range
-            .map(|range| range.start..range.end)
-            .unwrap_or_else(|| Key::MIN..Key::MAX);
+        let dry_run = job.dry_run;
+        let compact_key_range = job.compact_key_range;
+        let compact_lsn_range = job.compact_lsn_range;

-        let dry_run = flags.contains(CompactFlags::DryRun);
-
-        if compaction_key_range == (Key::MIN..Key::MAX) {
-            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
-        } else {
-            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
-        }
+        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);

        scopeguard::defer! {
            info!("done enhanced gc bottom-most compaction");
@@ -1970,11 +2005,15 @@ impl Timeline {
                // to get the truth data.
                let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
-                // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
+                // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
                // the real cutoff.
-                let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
+                let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX {
+                    real_gc_cutoff
+                } else {
+                    compact_lsn_range.end
+                };
                if gc_cutoff > real_gc_cutoff {
-                    warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
+                    warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
                    gc_cutoff = real_gc_cutoff;
                }
                gc_cutoff
@@ -1991,7 +2030,7 @@ impl Timeline {
            }
            let mut selected_layers: Vec<Layer> = Vec::new();
            drop(gc_info);
-            // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
+            // First, pick all the layers that intersect with or are below the gc_cutoff, and get the largest LSN among the selected layers.
            let Some(max_layer_lsn) = layers
                .iter_historic_layers()
                .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
@@ -2001,27 +2040,45 @@ impl Timeline {
                info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
                return Ok(());
            };
+            // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
+            // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
+            // it is a branch.
+            let Some(min_layer_lsn) = layers
+                .iter_historic_layers()
+                .filter(|desc| {
+                    if compact_lsn_range.start == Lsn::INVALID {
+                        true // no lower bound requested (start == Lsn(0)): every layer qualifies
+                    } else {
+                        desc.get_lsn_range().end > compact_lsn_range.start // layer must end strictly above the lower bound
+                    }
+                })
+                .map(|desc| desc.get_lsn_range().start)
+                .min()
+            else {
+                info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.start);
+                return Ok(());
+            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
            let mut rewrite_layers = Vec::new();
            for desc in layers.iter_historic_layers() {
                if desc.get_lsn_range().end <= max_layer_lsn
-                    && overlaps_with(&desc.get_key_range(), &compaction_key_range)
+                    && desc.get_lsn_range().start >= min_layer_lsn
+                    && overlaps_with(&desc.get_key_range(), &compact_key_range)
                {
                    // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
                    // even if it might contain extra keys
                    selected_layers.push(guard.get_from_desc(&desc));
                    // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
                    // to overlap image layers)
-                    if desc.is_delta()
-                        && !fully_contains(&compaction_key_range, &desc.get_key_range())
+                    if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range())
                    {
                        rewrite_layers.push(desc);
                    }
                }
            }
            if selected_layers.is_empty() {
-                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
+                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end);
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
@@ -2029,13 +2086,20 @@ impl Timeline {
                selected_layers,
                gc_cutoff,
                retain_lsns_below_horizon,
+                min_layer_lsn,
                max_layer_lsn,
-                compaction_key_range,
+                compaction_key_range: compact_key_range,
                rewrite_layers,
            }
        };
-        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
-            Lsn(self.ancestor_lsn.0 + 1)
+        let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID {
+            // If we only compact above some LSN, we should get the history from the current branch below the specified LSN.
+            // We use job_desc.min_layer_lsn as if it's the lowest branch point.
+            (true, job_desc.min_layer_lsn)
+        } else if self.ancestor_timeline.is_some() {
+            // In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the
+            // LSN ranges all the way to the ancestor timeline.
+            (true, self.ancestor_lsn)
        } else {
            let res = job_desc
                .retain_lsns_below_horizon
@@ -2053,17 +2117,19 @@ impl Timeline {
                        .unwrap_or(job_desc.gc_cutoff)
                );
            }
-            res
+            (false, res)
        };
        info!(
-            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
+            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
            job_desc.selected_layers.len(),
            job_desc.rewrite_layers.len(),
            job_desc.max_layer_lsn,
+            job_desc.min_layer_lsn,
            job_desc.gc_cutoff,
            lowest_retain_lsn,
            job_desc.compaction_key_range.start,
-            job_desc.compaction_key_range.end
+            job_desc.compaction_key_range.end,
+            has_data_below,
        );

        for layer in &job_desc.selected_layers {
@@ -2107,10 +2173,22 @@ impl Timeline {
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
+        let mut total_downloaded_size = 0;
+        let mut total_layer_size = 0;
        for layer in &job_desc.selected_layers {
+            if layer.needs_download().await?.is_some() {
+                total_downloaded_size += layer.layer_desc().file_size;
+            }
+            total_layer_size += layer.layer_desc().file_size;
            let resident_layer = layer.download_and_keep_resident().await?;
            downloaded_layers.push(resident_layer);
        }
+        info!(
+            "finish downloading layers, downloaded={}, total={}, ratio={:.2}",
+            total_downloaded_size,
+            total_layer_size,
+            total_downloaded_size as f64 / total_layer_size as f64
+        );
        for resident_layer in &downloaded_layers {
            if resident_layer.layer_desc().is_delta() {
                let layer = resident_layer.get_as_delta(ctx).await?;
@@ -2133,7 +2211,7 @@ impl Timeline {

        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
        // when some condition meet.
-        let mut image_layer_writer = if self.ancestor_timeline.is_none() {
+        let mut image_layer_writer = if !has_data_below {
            Some(
                SplitImageLayerWriter::new(
                    self.conf,
@@ -2166,7 +2244,11 @@ impl Timeline {
        }
        let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();

-        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
+        /// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`).
+        /// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set.
+        /// In those cases, we need to pull up data from below the LSN range we're compacting.
+        ///
+        /// This function unifies the cases so that later code doesn't have to think about it.
        ///
        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
        /// is needed for reconstruction. This should be fixed in the future.
@@ -2174,17 +2256,19 @@ impl Timeline {
        /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
        /// images.
        async fn get_ancestor_image(
-            tline: &Arc<Timeline>,
+            this_tline: &Arc<Timeline>,
            key: Key,
            ctx: &RequestContext,
+            has_data_below: bool,
+            history_lsn_point: Lsn,
        ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
-            if tline.ancestor_timeline.is_none() {
+            if !has_data_below {
                return Ok(None);
            };
            // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
            // as much existing code as possible.
-            let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
-            Ok(Some((key, tline.ancestor_lsn, img)))
+            let img = this_tline.get(key, history_lsn_point, ctx).await?;
+            Ok(Some((key, history_lsn_point, img)))
        }

        // Actually, we can decide not to write to the image layer at all at this point because
@@ -2268,7 +2352,8 @@ impl Timeline {
                        job_desc.gc_cutoff,
                        &job_desc.retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
+                        get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
+                            .await?,
                    )
                    .await?;
                retention
@@ -2297,7 +2382,7 @@ impl Timeline {
                job_desc.gc_cutoff,
                &job_desc.retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
+                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
            )
            .await?;
        retention
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -877,22 +877,24 @@ impl WalIngest {
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
-            .await?
-        {
-            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            for segno in modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+                .await?
+            {
+                let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;

-            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
-                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
-            });
+                let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                    pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
+                });

-            if may_delete {
-                modification
-                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
-                    .await?;
-                trace!("Drop CLOG segment {:>04X}", segno);
+                if may_delete {
+                    modification
+                        .drop_slru_segment(SlruKind::Clog, segno, ctx)
+                        .await?;
+                    trace!("Drop CLOG segment {:>04X}", segno);
+                }
            }
        }

@@ -1047,16 +1049,18 @@ impl WalIngest {

        // Delete all the segments except the last one. The last segment can still
        // contain, possibly partially, valid data.
-        while segment != endsegment {
-            modification
-                .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
-                .await?;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            while segment != endsegment {
+                modification
+                    .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
+                    .await?;

-            /* move to next segment, handling wraparound correctly */
-            if segment == maxsegment {
-                segment = 0;
-            } else {
-                segment += 1;
+                /* move to next segment, handling wraparound correctly */
+                if segment == maxsegment {
+                    segment = 0;
+                } else {
+                    segment += 1;
+                }
            }
        }

--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -22,6 +22,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "portability/instr_time.h"
 #include "postmaster/interrupt.h"
 #include "storage/buf_internals.h"
 #include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
 	 */
 	PSConnectionState state;
 	PGconn		   *conn;
+
+	/* request / response counters for debugging */
+	uint64			nrequests_sent;
+	uint64			nresponses_received;
+
 	/*---
 	 * WaitEventSet containing:
 	 *	- WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}

 		shard->state = PS_Connected;
+		shard->nrequests_sent = 0;
+		shard->nresponses_received = 0;
 	}
 	/* FALLTHROUGH */
 	case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
 	int			ret;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
+	instr_time	now,
+				start_ts,
+				since_start,
+				last_log_ts,
+				since_last_log;
+	bool		logged = false;
+
+	/*
+	 * As a debugging aid, if we don't get a response for a long time, print a
+	 * log message.
+	 *
+	 * 10 s is a very generous threshold, normally we expect a response in a
+	 * few milliseconds. We have metrics to track latencies in normal ranges,
+	 * but in the cases that take exceptionally long, it's useful to log the
+	 * exact timestamps.
+	 */
+#define LOG_INTERVAL_US		UINT64CONST(10 * 1000000)
+
+	INSTR_TIME_SET_CURRENT(now);
+	start_ts = last_log_ts = now;
+	INSTR_TIME_SET_ZERO(since_last_log);

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
 	if (ret == 0)
 	{
 		WaitEvent	event;
+		long		timeout;	/* in ms, as WaitEventSetWait() expects; remainder of the log interval, rounded up, never negative */
+
+		timeout = Max(0, ((int64) LOG_INTERVAL_US - (int64) INSTR_TIME_GET_MICROSEC(since_last_log) + 999) / 1000);

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+		(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
 								WAIT_EVENT_NEON_PS_READ);
 		ResetLatch(MyLatch);

@@ -684,9 +716,40 @@ retry:
 			}
 		}

+		/*
+		 * Print a message to the log if a long time has passed with no
+		 * response.
+		 */
+		INSTR_TIME_SET_CURRENT(now);
+		since_last_log = now;
+		INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
+		if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
+		{
+			since_start = now;
+			INSTR_TIME_SUBTRACT(since_start, start_ts);
+			neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
+						   INSTR_TIME_GET_DOUBLE(since_start),
+						   shard->nrequests_sent, shard->nresponses_received);
+			last_log_ts = now;
+			logged = true;
+		}
+
 		goto retry;
 	}

+	/*
+	 * If we logged earlier that the response is taking a long time, log
+	 * another message when the response is finally received.
+	 */
+	if (logged)
+	{
+		INSTR_TIME_SET_CURRENT(now);
+		since_start = now;
+		INSTR_TIME_SUBTRACT(since_start, start_ts);
+		neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
+					   INSTR_TIME_GET_DOUBLE(since_start));
+	}
+
 	return ret;
 }

@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
 	 * point, but on the grand scheme of things it's only a small issue.
 	 */
+	shard->nrequests_sent++;
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
 		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}

+	shard->nresponses_received++;
 	return (NeonResponse *) resp;
 }

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
 	 * ensuring we have received all but the last n requests (n = newsize).
 	 */
 	if (MyPState->n_requests_inflight > newsize)
-		prefetch_wait_for(MyPState->ring_unused - newsize);
+	{
+		Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
+		prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}

 	/* construct the new PrefetchState, and copy over the memory contexts */
 	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_last = newsize;
 	newPState->ring_unused = newsize;
 	newPState->ring_receive = newsize;
-	newPState->ring_flush = newsize;
 	newPState->max_shard_no = MyPState->max_shard_no;
 	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));

@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
 		}
 		newPState->n_unused -= 1;
 	}
+	newPState->ring_flush = newPState->ring_receive;

 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
 		if (slot->status == PRFS_RECEIVED)
 		{
 			pfree(slot->response);
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -74,10 +74,6 @@ impl std::fmt::Display for Backend<'_, ()> {
                    .debug_tuple("ControlPlane::ProxyV1")
                    .field(&endpoint.url())
                    .finish(),
-                ControlPlaneClient::Neon(endpoint) => fmt
-                    .debug_tuple("ControlPlane::Neon")
-                    .field(&endpoint.url())
-                    .finish(),
                #[cfg(any(test, feature = "testing"))]
                ControlPlaneClient::PostgresMock(endpoint) => fmt
                    .debug_tuple("ControlPlane::PostgresMock")
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -43,9 +43,6 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
-    #[value(name("console"), alias("cplane"))]
-    ControlPlane,
-
    #[value(name("cplane-v1"), alias("control-plane"))]
    ControlPlaneV1,

@@ -488,40 +485,7 @@ async fn main() -> anyhow::Result<()> {
    }

    if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
-        if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
-            match (redis_notifications_client, regional_redis_client.clone()) {
-                (None, None) => {}
-                (client1, client2) => {
-                    let cache = api.caches.project_info.clone();
-                    if let Some(client) = client1 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            cancel_map.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    if let Some(client) = client2 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            cancel_map.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-                }
-            }
-            if let Some(regional_redis_client) = regional_redis_client {
-                let cache = api.caches.endpoints_cache.clone();
-                let con = regional_redis_client;
-                let span = tracing::info_span!("endpoints_cache");
-                maintenance_tasks.spawn(
-                    async move { cache.do_read(con, cancellation_token.clone()).await }
-                        .instrument(span),
-                );
-            }
-        } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
+        if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
            match (redis_notifications_client, regional_redis_client.clone()) {
                (None, None) => {}
                (client1, client2) => {
@@ -757,65 +721,6 @@ fn build_auth_backend(
            Ok(Either::Left(config))
        }

-        AuthBackendType::ControlPlane => {
-            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
-            let project_info_cache_config: ProjectInfoCacheOptions =
-                args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;
-
-            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
-            info!(
-                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
-            );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
-            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
-                wake_compute_cache_config,
-                project_info_cache_config,
-                endpoint_cache_config,
-            )));
-
-            let config::ConcurrencyLockOptions {
-                shards,
-                limiter,
-                epoch,
-                timeout,
-            } = args.wake_compute_lock.parse()?;
-            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
-            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
-                "wake_compute_lock",
-                limiter,
-                shards,
-                timeout,
-                epoch,
-                &Metrics::get().wake_compute_lock,
-            )?));
-            tokio::spawn(locks.garbage_collect_worker());
-
-            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
-
-            let endpoint = http::Endpoint::new(url, http::new_client());
-
-            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
-            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
-            let wake_compute_endpoint_rate_limiter =
-                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
-
-            let api = control_plane::client::neon::NeonControlPlaneClient::new(
-                endpoint,
-                args.control_plane_token.clone(),
-                caches,
-                locks,
-                wake_compute_endpoint_rate_limiter,
-            );
-            let api = control_plane::client::ControlPlaneClient::Neon(api);
-            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
-
-            let config = Box::leak(Box::new(auth_backend));
-
-            Ok(Either::Left(config))
-        }
-
        #[cfg(feature = "testing")]
        AuthBackendType::Postgres => {
            let url = args.auth_endpoint.parse()?;
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -1,7 +1,6 @@
 pub mod cplane_proxy_v1;
 #[cfg(any(test, feature = "testing"))]
 pub mod mock;
-pub mod neon;

 use std::hash::Hash;
 use std::sync::Arc;
@@ -28,10 +27,8 @@ use crate::types::EndpointId;
 #[non_exhaustive]
 #[derive(Clone)]
 pub enum ControlPlaneClient {
-    /// New Proxy V1 control plane API
+    /// Proxy V1 control plane API
    ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
-    /// Current Management API (V2).
-    Neon(neon::NeonControlPlaneClient),
    /// Local mock control plane.
    #[cfg(any(test, feature = "testing"))]
    PostgresMock(mock::MockControlPlane),
@@ -49,7 +46,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
        match self {
            Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
-            Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(test)]
@@ -66,7 +62,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
        match self {
            Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(test)]
@@ -81,7 +76,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
        match self {
            Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
-            Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
            #[cfg(test)]
@@ -96,7 +90,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
        match self {
            Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
-            Self::Neon(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(test)]
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -1,511 +0,0 @@
-//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use ::http::header::AUTHORIZATION;
-use ::http::HeaderName;
-use futures::TryFutureExt;
-use postgres_client::config::SslMode;
-use tokio::time::Instant;
-use tracing::{debug, info, info_span, warn, Instrument};
-
-use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
-use crate::auth::backend::jwt::AuthRule;
-use crate::auth::backend::ComputeUserInfo;
-use crate::cache::Cached;
-use crate::context::RequestContext;
-use crate::control_plane::caches::ApiCaches;
-use crate::control_plane::errors::{
-    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
-};
-use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
-use crate::control_plane::{
-    AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
-};
-use crate::metrics::{CacheOutcome, Metrics};
-use crate::rate_limiter::WakeComputeRateLimiter;
-use crate::types::{EndpointCacheKey, EndpointId};
-use crate::{compute, http, scram};
-
-const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
-
-#[derive(Clone)]
-pub struct NeonControlPlaneClient {
-    endpoint: http::Endpoint,
-    pub caches: &'static ApiCaches,
-    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
-    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    // put in a shared ref so we don't copy secrets all over in memory
-    jwt: Arc<str>,
-}
-
-impl NeonControlPlaneClient {
-    /// Construct an API object containing the auth parameters.
-    pub fn new(
-        endpoint: http::Endpoint,
-        jwt: Arc<str>,
-        caches: &'static ApiCaches,
-        locks: &'static ApiLocks<EndpointCacheKey>,
-        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    ) -> Self {
-        Self {
-            endpoint,
-            caches,
-            locks,
-            wake_compute_endpoint_rate_limiter,
-            jwt,
-        }
-    }
-
-    pub(crate) fn url(&self) -> &str {
-        self.endpoint.url().as_str()
-    }
-
-    async fn do_get_auth_info(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &user_info.endpoint.normalize())
-        {
-            // TODO: refactor this because it's weird
-            // this is a failure to authenticate but we return Ok.
-            info!("endpoint is not valid, skipping the request");
-            return Ok(AuthInfo::default());
-        }
-        let request_id = ctx.session_id().to_string();
-        let application_name = ctx.console_application_name();
-        async {
-            let request = self
-                .endpoint
-                .get_path("proxy_get_role_secret")
-                .header(X_REQUEST_ID, &request_id)
-                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .query(&[
-                    ("application_name", application_name.as_str()),
-                    ("project", user_info.endpoint.as_str()),
-                    ("role", user_info.user.as_str()),
-                ])
-                .build()?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self.endpoint.execute(request).await?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-            let body = match parse_body::<GetRoleSecret>(response).await {
-                Ok(body) => body,
-                // Error 404 is special: it's ok not to have a secret.
-                // TODO(anna): retry
-                Err(e) => {
-                    return if e.get_reason().is_not_found() {
-                        // TODO: refactor this because it's weird
-                        // this is a failure to authenticate but we return Ok.
-                        Ok(AuthInfo::default())
-                    } else {
-                        Err(e.into())
-                    };
-                }
-            };
-
-            let secret = if body.role_secret.is_empty() {
-                None
-            } else {
-                let secret = scram::ServerSecret::parse(&body.role_secret)
-                    .map(AuthSecret::Scram)
-                    .ok_or(GetAuthInfoError::BadSecret)?;
-                Some(secret)
-            };
-            let allowed_ips = body.allowed_ips.unwrap_or_default();
-            Metrics::get()
-                .proxy
-                .allowed_ips_number
-                .observe(allowed_ips.len() as f64);
-            Ok(AuthInfo {
-                secret,
-                allowed_ips,
-                project_id: body.project_id,
-            })
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_get_auth_info"))
-        .await
-    }
-
-    async fn do_get_endpoint_jwks(
-        &self,
-        ctx: &RequestContext,
-        endpoint: EndpointId,
-    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &endpoint.normalize())
-        {
-            return Err(GetEndpointJwksError::EndpointNotFound);
-        }
-        let request_id = ctx.session_id().to_string();
-        async {
-            let request = self
-                .endpoint
-                .get_with_url(|url| {
-                    url.path_segments_mut()
-                        .push("endpoints")
-                        .push(endpoint.as_str())
-                        .push("jwks");
-                })
-                .header(X_REQUEST_ID, &request_id)
-                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .build()
-                .map_err(GetEndpointJwksError::RequestBuild)?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self
-                .endpoint
-                .execute(request)
-                .await
-                .map_err(GetEndpointJwksError::RequestExecute)?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-
-            let body = parse_body::<EndpointJwksResponse>(response).await?;
-
-            let rules = body
-                .jwks
-                .into_iter()
-                .map(|jwks| AuthRule {
-                    id: jwks.id,
-                    jwks_url: jwks.jwks_url,
-                    audience: jwks.jwt_audience,
-                    role_names: jwks.role_names,
-                })
-                .collect();
-
-            Ok(rules)
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_get_endpoint_jwks"))
-        .await
-    }
-
-    async fn do_wake_compute(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<NodeInfo, WakeComputeError> {
-        let request_id = ctx.session_id().to_string();
-        let application_name = ctx.console_application_name();
-        async {
-            let mut request_builder = self
-                .endpoint
-                .get_path("proxy_wake_compute")
-                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .query(&[
-                    ("application_name", application_name.as_str()),
-                    ("project", user_info.endpoint.as_str()),
-                ]);
-
-            let options = user_info.options.to_deep_object();
-            if !options.is_empty() {
-                request_builder = request_builder.query(&options);
-            }
-
-            let request = request_builder.build()?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self.endpoint.execute(request).await?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-            let body = parse_body::<WakeCompute>(response).await?;
-
-            // Unfortunately, ownership won't let us use `Option::ok_or` here.
-            let (host, port) = match parse_host_port(&body.address) {
-                None => return Err(WakeComputeError::BadComputeAddress(body.address)),
-                Some(x) => x,
-            };
-
-            // Don't set anything but host and port! This config will be cached.
-            // We'll set username and such later using the startup message.
-            // TODO: add more type safety (in progress).
-            let mut config = compute::ConnCfg::new(host.to_owned(), port);
-            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
-
-            let node = NodeInfo {
-                config,
-                aux: body.aux,
-                allow_self_signed_compute: false,
-            };
-
-            Ok(node)
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_wake_compute"))
-        .await
-    }
-}
-
-impl super::ControlPlaneApi for NeonControlPlaneClient {
-    #[tracing::instrument(skip_all)]
-    async fn get_role_secret(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        let normalized_ep = &user_info.endpoint.normalize();
-        let user = &user_info.user;
-        if let Some(role_secret) = self
-            .caches
-            .project_info
-            .get_role_secret(normalized_ep, user)
-        {
-            return Ok(role_secret);
-        }
-        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-            self.caches.project_info.insert_role_secret(
-                project_id,
-                normalized_ep_int,
-                user.into(),
-                auth_info.secret.clone(),
-            );
-            self.caches.project_info.insert_allowed_ips(
-                project_id,
-                normalized_ep_int,
-                Arc::new(auth_info.allowed_ips),
-            );
-            ctx.set_project_id(project_id);
-        }
-        // When we just got a secret, we don't need to invalidate it.
-        Ok(Cached::new_uncached(auth_info.secret))
-    }
-
-    async fn get_allowed_ips_and_secret(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        let normalized_ep = &user_info.endpoint.normalize();
-        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
-            Metrics::get()
-                .proxy
-                .allowed_ips_cache_misses
-                .inc(CacheOutcome::Hit);
-            return Ok((allowed_ips, None));
-        }
-        Metrics::get()
-            .proxy
-            .allowed_ips_cache_misses
-            .inc(CacheOutcome::Miss);
-        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
-        let allowed_ips = Arc::new(auth_info.allowed_ips);
-        let user = &user_info.user;
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-            self.caches.project_info.insert_role_secret(
-                project_id,
-                normalized_ep_int,
-                user.into(),
-                auth_info.secret.clone(),
-            );
-            self.caches.project_info.insert_allowed_ips(
-                project_id,
-                normalized_ep_int,
-                allowed_ips.clone(),
-            );
-            ctx.set_project_id(project_id);
-        }
-        Ok((
-            Cached::new_uncached(allowed_ips),
-            Some(Cached::new_uncached(auth_info.secret)),
-        ))
-    }
-
-    #[tracing::instrument(skip_all)]
-    async fn get_endpoint_jwks(
-        &self,
-        ctx: &RequestContext,
-        endpoint: EndpointId,
-    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        self.do_get_endpoint_jwks(ctx, endpoint).await
-    }
-
-    #[tracing::instrument(skip_all)]
-    async fn wake_compute(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key = user_info.endpoint_cache_key();
-
-        macro_rules! check_cache {
-            () => {
-                if let Some(cached) = self.caches.node_info.get(&key) {
-                    let (cached, info) = cached.take_value();
-                    let info = info.map_err(|c| {
-                        info!(key = &*key, "found cached wake_compute error");
-                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
-                    })?;
-
-                    debug!(key = &*key, "found cached compute node info");
-                    ctx.set_project(info.aux.clone());
-                    return Ok(cached.map(|()| info));
-                }
-            };
-        }
-
-        // Every time we do a wakeup http request, the compute node will stay up
-        // for some time (highly depends on the console's scale-to-zero policy);
-        // The connection info remains the same during that period of time,
-        // which means that we might cache it to reduce the load and latency.
-        check_cache!();
-
-        let permit = self.locks.get_permit(&key).await?;
-
-        // after getting back a permit - it's possible the cache was filled
-        // double check
-        if permit.should_check_cache() {
-            // TODO: if there is something in the cache, mark the permit as success.
-            check_cache!();
-        }
-
-        // check rate limit
-        if !self
-            .wake_compute_endpoint_rate_limiter
-            .check(user_info.endpoint.normalize_intern(), 1)
-        {
-            return Err(WakeComputeError::TooManyConnections);
-        }
-
-        let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
-        match node {
-            Ok(node) => {
-                ctx.set_project(node.aux.clone());
-                debug!(key = &*key, "created a cache entry for woken compute node");
-
-                let mut stored_node = node.clone();
-                // store the cached node as 'warm_cached'
-                stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
-
-                let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
-
-                Ok(cached.map(|()| node))
-            }
-            Err(err) => match err {
-                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
-                    let Some(status) = &err.status else {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    };
-
-                    let reason = status
-                        .details
-                        .error_info
-                        .map_or(Reason::Unknown, |x| x.reason);
-
-                    // if we can retry this error, do not cache it.
-                    if reason.can_retry() {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    }
-
-                    // at this point, we should only have quota errors.
-                    debug!(
-                        key = &*key,
-                        "created a cache entry for the wake compute error"
-                    );
-
-                    self.caches.node_info.insert_ttl(
-                        key,
-                        Err(err.clone()),
-                        Duration::from_secs(30),
-                    );
-
-                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                        err,
-                    )))
-                }
-                err => return Err(err),
-            },
-        }
-    }
-}
-
-/// Parse http response body, taking status code into account.
-async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    response: http::Response,
-) -> Result<T, ControlPlaneError> {
-    let status = response.status();
-    if status.is_success() {
-        // We shouldn't log raw body because it may contain secrets.
-        info!("request succeeded, processing the body");
-        return Ok(response.json().await?);
-    }
-    let s = response.bytes().await?;
-    // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
-    info!("response_error plaintext: {:?}", s);
-
-    // Don't throw an error here because it's not as important
-    // as the fact that the request itself has failed.
-    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
-        warn!("failed to parse error body: {e}");
-        ControlPlaneErrorMessage {
-            error: "reason unclear (malformed error message)".into(),
-            http_status_code: status,
-            status: None,
-        }
-    });
-    body.http_status_code = status;
-
-    warn!("console responded with an error ({status}): {body:?}");
-    Err(ControlPlaneError::Message(Box::new(body)))
-}
-
-fn parse_host_port(input: &str) -> Option<(&str, u16)> {
-    let (host, port) = input.rsplit_once(':')?;
-    let ipv6_brackets: &[_] = &['[', ']'];
-    Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_host_port_v4() {
-        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
-        assert_eq!(host, "127.0.0.1");
-        assert_eq!(port, 5432);
-    }
-
-    #[test]
-    fn test_parse_host_port_v6() {
-        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
-        assert_eq!(host, "2001:db8::1");
-        assert_eq!(port, 5432);
-    }
-
-    #[test]
-    fn test_parse_host_port_url() {
-        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
-            .expect("failed to parse");
-        assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
-        assert_eq!(port, 5432);
-    }
-}
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -221,15 +221,6 @@ pub(crate) struct UserFacingMessage {
    pub(crate) message: Box<str>,
 }

-/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
-/// Returned by the `/proxy_get_role_secret` API method.
-#[derive(Deserialize)]
-pub(crate) struct GetRoleSecret {
-    pub(crate) role_secret: Box<str>,
-    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
-    pub(crate) project_id: Option<ProjectIdInt>,
-}
-
 /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
 /// Returned by the `/get_endpoint_access_control` API method.
 #[derive(Deserialize)]
@@ -240,13 +231,6 @@ pub(crate) struct GetEndpointAccessControl {
    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
 }

-// Manually implement debug to omit sensitive info.
-impl fmt::Debug for GetRoleSecret {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("GetRoleSecret").finish_non_exhaustive()
-    }
-}
-
 /// Response which holds compute node's `host:port` pair.
 /// Returned by the `/proxy_wake_compute` API method.
 #[derive(Debug, Deserialize)]
@@ -477,18 +461,18 @@ mod tests {
        let json = json!({
            "role_secret": "secret",
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
            "project_id": "project",
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;

        Ok(())
    }
--- a/storage_controller/src/background_node_operations.rs
+++ b/storage_controller/src/background_node_operations.rs
@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
 use tokio_util::sync::CancellationToken;
 use utils::id::NodeId;

-pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
+pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;

 #[derive(Copy, Clone)]
 pub(crate) struct Drain {
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -18,8 +18,9 @@ use pageserver_api::controller_api::{
    ShardsPreferredAzsRequest, TenantCreateRequest,
 };
 use pageserver_api::models::{
-    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
+    TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
+    TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
+    TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
@@ -208,6 +209,27 @@ async fn handle_tenant_location_config(
    )
 }

+async fn handle_tenant_config_patch(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
+
+    json_response(
+        StatusCode::OK,
+        service.tenant_config_patch(config_req).await?,
+    )
+}
+
 async fn handle_tenant_config_set(
    service: Arc<Service>,
    req: Request<Body>,
@@ -857,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Infra)?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    let safekeepers = state.service.safekeepers_list().await?;
+    json_response(StatusCode::OK, safekeepers)
+}
+
 async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Scrubber)?;

@@ -1181,7 +1218,7 @@ impl From<ReconcileError> for ApiError {
 ///
 /// Not used by anything except manual testing.
 async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
+    check_permissions(&req, Scope::Infra)?;

    let id = parse_request_param::<i64>(&req, "id")?;

@@ -1199,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
    match res {
        Ok(b) => json_response(StatusCode::OK, b),
        Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
-            Err(ApiError::NotFound("unknown instance_id".into()))
+            Err(ApiError::NotFound("unknown instance id".into()))
        }
        Err(other) => Err(other.into()),
    }
@@ -1795,6 +1832,21 @@ pub fn make_router(
                RequestName("control_v1_metadata_health_list_outdated"),
            )
        })
+        // Safekeepers
+        .get("/control/v1/safekeeper", |r| {
+            named_request_span(
+                r,
+                handle_safekeeper_list,
+                RequestName("control_v1_safekeeper_list"),
+            )
+        })
+        .get("/control/v1/safekeeper/:id", |r| {
+            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+        })
+        .post("/control/v1/safekeeper/:id", |r| {
+            // id is in the body
+            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(
@@ -1847,13 +1899,6 @@ pub fn make_router(
        .put("/control/v1/step_down", |r| {
            named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
        })
-        .get("/control/v1/safekeeper/:id", |r| {
-            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
-        })
-        .post("/control/v1/safekeeper/:id", |r| {
-            // id is in the body
-            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -1863,6 +1908,13 @@ pub fn make_router(
        .delete("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
        })
+        .patch("/v1/tenant/config", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_config_patch,
+                RequestName("v1_tenant_config"),
+            )
+        })
        .put("/v1/tenant/config", |r| {
            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
        })
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -104,6 +104,7 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealth,
    ListMetadataHealthUnhealthy,
    ListMetadataHealthOutdated,
+    ListSafekeepers,
    GetLeader,
    UpdateLeader,
    SetPreferredAzs,
@@ -1011,6 +1012,22 @@ impl Persistence {
        Ok(())
    }

+    /// Load every safekeeper record stored in the `safekeepers` table.
+    pub(crate) async fn list_safekeepers(&self) -> DatabaseResult<Vec<SafekeeperPersistence>> {
+        let safekeepers: Vec<SafekeeperPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::ListSafekeepers,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::safekeepers::table.load::<SafekeeperPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len());
+
+        Ok(safekeepers)
+    }
+
    pub(crate) async fn safekeeper_get(
        &self,
        id: i64,
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -742,6 +742,50 @@ impl Scheduler {
        self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
    }

+    /// For choosing which AZ to schedule a new shard into, use this.  It will return the
+    /// AZ with the lowest median utilization.
+    ///
+    /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
+    /// node, because while tenants start out single sharded, when they grow and undergo
+    /// shard-split, they will occupy space on many nodes within an AZ.
+    ///
+    /// We use median rather than total free space or mean utilization, because
+    /// we wish to avoid preferring AZs that have low-load nodes resulting from
+    /// recent replacements.
+    ///
+    /// The practical result is that we will pick an AZ based on its median node, and
+    /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
+    pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
+        if self.nodes.is_empty() {
+            return None;
+        }
+
+        let mut scores_by_az = HashMap::new();
+        for (node_id, node) in &self.nodes {
+            let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
+            let score = match &node.may_schedule {
+                MaySchedule::Yes(utilization) => utilization.score(),
+                MaySchedule::No => PageserverUtilization::full().score(),
+            };
+            az_scores.push((node_id, node, score));
+        }
+
+        // Sort by utilization.  Also include the node ID to break ties.
+        for scores in scores_by_az.values_mut() {
+            scores.sort_by_key(|i| (i.2, i.0));
+        }
+
+        let mut median_by_az = scores_by_az
+            .iter()
+            .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
+            .collect::<Vec<_>>();
+        // Sort by utilization.  Also include the AZ to break ties.
+        median_by_az.sort_by_key(|i| (i.1, i.0));
+
+        // Return the AZ with the lowest median utilization
+        Some(median_by_az.first().unwrap().0.clone())
+    }
+
    /// Unit test access to internal state
    #[cfg(test)]
    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
@@ -1087,4 +1131,53 @@ mod tests {
            intent.clear(&mut scheduler);
        }
    }
+
+    #[test]
+    fn az_scheduling_for_new_tenant() {
+        let az_a_tag = AvailabilityZone("az-a".to_string());
+        let az_b_tag = AvailabilityZone("az-b".to_string());
+        let nodes = test_utils::make_test_nodes(
+            6,
+            &[
+                az_a_tag.clone(),
+                az_a_tag.clone(),
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+                az_b_tag.clone(),
+                az_b_tag.clone(),
+            ],
+        );
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        /// Force the utilization of a node in Scheduler's state to a particular
+        /// shard count.
+        fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
+            let mut node = Node::new(
+                node_id,
+                "".to_string(),
+                0,
+                "".to_string(),
+                0,
+                scheduler.nodes.get(&node_id).unwrap().az.clone(),
+            );
+            node.set_availability(NodeAvailability::Active(test_utilization::simple(
+                shard_count,
+                0,
+            )));
+            scheduler.node_upsert(&node);
+        }
+
+        // Initial empty state.  Scores are tied, scheduler prefers lower AZ ID.
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
+
+        // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
+        set_utilization(&mut scheduler, NodeId(1), 1000000);
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
+
+        // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
+        // should prefer the other AZ.
+        set_utilization(&mut scheduler, NodeId(2), 1000000);
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
+    }
 }
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -29,6 +29,19 @@ diesel::table! {
    }
 }

+diesel::table! {
+    safekeepers (id) {
+        id -> Int8,
+        region_id -> Text,
+        version -> Int8,
+        host -> Text,
+        port -> Int4,
+        active -> Bool,
+        http_port -> Int4,
+        availability_zone_id -> Text,
+    }
+}
+
 diesel::table! {
    tenant_shards (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
@@ -45,18 +58,10 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
-
-diesel::table! {
-    safekeepers {
-        id -> Int8,
-        region_id -> Text,
-        version -> Int8,
-        instance_id -> Text,
-        host -> Text,
-        port -> Int4,
-        active -> Bool,
-        http_port -> Int4,
-        availability_zone_id -> Text,
-    }
-}
+diesel::allow_tables_to_appear_in_same_query!(
+    controllers,
+    metadata_health,
+    nodes,
+    safekeepers,
+    tenant_shards,
+);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -52,8 +52,8 @@ use pageserver_api::{
        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
    models::{
-        SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
-        TopTenantShardsRequest,
+        SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest,
+        TimelineArchivalConfigRequest, TopTenantShardsRequest,
    },
 };
 use reqwest::StatusCode;
@@ -100,6 +100,8 @@ use crate::{

 use context_iterator::TenantShardContextIterator;

+const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
+
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);

@@ -139,6 +141,7 @@ enum TenantOperations {
    Create,
    LocationConfig,
    ConfigSet,
+    ConfigPatch,
    TimeTravelRemoteStorage,
    Delete,
    UpdatePolicy,
@@ -1579,6 +1582,7 @@ impl Service {
                            attach_req.tenant_shard_id,
                            ShardIdentity::unsharded(),
                            PlacementPolicy::Attached(0),
+                            None,
                        ),
                    );
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -2106,6 +2110,16 @@ impl Service {
            )
        };

+        let preferred_az_id = {
+            let locked = self.inner.read().unwrap();
+            // Idempotency: take the existing value if the tenant already exists
+            if let Some(shard) = locked.tenants.get(create_ids.first().unwrap()) {
+                shard.preferred_az().cloned()
+            } else {
+                locked.scheduler.get_az_for_new_tenant()
+            }
+        };
+
        // Ordering: we persist tenant shards before creating them on the pageserver.  This enables a caller
        // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
        // during the creation, rather than risking leaving orphan objects in S3.
@@ -2125,7 +2139,7 @@ impl Service {
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
-                preferred_az_id: None,
+                preferred_az_id: preferred_az_id.as_ref().map(|az| az.to_string()),
            })
            .collect();

@@ -2161,6 +2175,7 @@ impl Service {
                    &create_req.shard_parameters,
                    create_req.config.clone(),
                    placement_policy.clone(),
+                    preferred_az_id.as_ref(),
                    &mut schedule_context,
                )
                .await;
@@ -2174,44 +2189,6 @@ impl Service {
            }
        }

-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            response_shards
-                .iter()
-                .filter_map(|resp| {
-                    let az_id = locked
-                        .nodes
-                        .get(&resp.node_id)
-                        .map(|n| n.get_availability_zone_id().clone())?;
-
-                    Some((resp.shard_id, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        // Note that we persist the preferred AZ for the new shards separately.
-        // In theory, we could "peek" the scheduler to determine where the shard will
-        // land, but the subsequent "real" call into the scheduler might select a different
-        // node. Hence, we do this awkward update to keep things consistent.
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            })?;
-
-        {
-            let mut locked = self.inner.write().unwrap();
-            for (tid, az_id) in updated {
-                if let Some(shard) = locked.tenants.get_mut(&tid) {
-                    shard.set_preferred_az(az_id);
-                }
-            }
-        }
-
        // If we failed to schedule shards, then they are still created in the controller,
        // but we return an error to the requester to avoid a silent failure when someone
        // tries to e.g. create a tenant whose placement policy requires more nodes than
@@ -2242,6 +2219,7 @@ impl Service {

    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
    /// case of a new tenant and a pre-existing one.
+    #[allow(clippy::too_many_arguments)]
    async fn do_initial_shard_scheduling(
        &self,
        tenant_shard_id: TenantShardId,
@@ -2249,6 +2227,7 @@ impl Service {
        shard_params: &ShardParameters,
        config: TenantConfig,
        placement_policy: PlacementPolicy,
+        preferred_az_id: Option<&AvailabilityZone>,
        schedule_context: &mut ScheduleContext,
    ) -> InitialShardScheduleOutcome {
        let mut locked = self.inner.write().unwrap();
@@ -2259,10 +2238,6 @@ impl Service {
            Entry::Occupied(mut entry) => {
                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");

-                // TODO: schedule() should take an anti-affinity expression that pushes
-                // attached and secondary locations (independently) away frorm those
-                // pageservers also holding a shard for this tenant.
-
                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
                    return InitialShardScheduleOutcome::ShardScheduleError(err);
                }
@@ -2286,6 +2261,7 @@ impl Service {
                    tenant_shard_id,
                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
                    placement_policy,
+                    preferred_az_id.cloned(),
                ));

                state.generation = initial_generation;
@@ -2602,6 +2578,55 @@ impl Service {
        Ok(result)
    }

+    pub(crate) async fn tenant_config_patch(
+        &self,
+        req: TenantConfigPatchRequest,
+    ) -> Result<(), ApiError> {
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            req.tenant_id,
+            TenantOperations::ConfigPatch,
+        )
+        .await;
+
+        let tenant_id = req.tenant_id;
+        let patch = req.config;
+
+        let base = {
+            let locked = self.inner.read().unwrap();
+            let shards = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+
+            let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable();
+
+            let first = match configs.peek() {
+                Some(first) => (*first).clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+                    ));
+                }
+            };
+
+            if !configs.all_equal() {
+                tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id);
+                // This can't happen because we atomically update the database records
+                // of all shards to the new value in [`Self::set_tenant_config_and_reconcile`].
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Tenant configs for {} are mismatched",
+                    req.tenant_id
+                )));
+            }
+
+            first
+        };
+
+        let updated_config = base.apply_patch(patch);
+        self.set_tenant_config_and_reconcile(tenant_id, updated_config)
+            .await
+    }
+
    pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
        // We require an exclusive lock, because we are updating persistent and in-memory state
        let _tenant_lock = trace_exclusive_lock(
@@ -2611,12 +2636,32 @@ impl Service {
        )
        .await;

-        let tenant_id = req.tenant_id;
-        let config = req.config;
+        let tenant_exists = {
+            let locked = self.inner.read().unwrap();
+            let mut r = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+            r.next().is_some()
+        };

+        if !tenant_exists {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+            ));
+        }
+
+        self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
+            .await
+    }
+
+    async fn set_tenant_config_and_reconcile(
+        &self,
+        tenant_id: TenantId,
+        config: TenantConfig,
+    ) -> Result<(), ApiError> {
        self.persistence
            .update_tenant_shard(
-                TenantFilter::Tenant(req.tenant_id),
+                TenantFilter::Tenant(tenant_id),
                None,
                Some(config.clone()),
                None,
@@ -4184,7 +4229,8 @@ impl Service {
                        },
                    );

-                    let mut child_state = TenantShard::new(child, child_shard, policy.clone());
+                    let mut child_state =
+                        TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
                    child_state.observed = ObservedState {
                        locations: child_observed,
@@ -6728,7 +6774,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                .await;

            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -6981,7 +7027,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                .await;
        }

@@ -7113,6 +7159,12 @@ impl Service {
        global_observed
    }

+    pub(crate) async fn safekeepers_list(
+        &self,
+    ) -> Result<Vec<crate::persistence::SafekeeperPersistence>, DatabaseError> {
+        self.persistence.list_safekeepers().await
+    }
+
    pub(crate) async fn get_safekeeper(
        &self,
        id: i64,
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -472,6 +472,7 @@ impl TenantShard {
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
        policy: PlacementPolicy,
+        preferred_az_id: Option<AvailabilityZone>,
    ) -> Self {
        metrics::METRICS_REGISTRY
            .metrics_group
@@ -495,7 +496,7 @@ impl TenantShard {
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
-            preferred_az_id: None,
+            preferred_az_id,
        }
    }

@@ -1571,6 +1572,7 @@ pub(crate) mod tests {
            )
            .unwrap(),
            policy,
+            None,
        )
    }

@@ -1597,7 +1599,7 @@ pub(crate) mod tests {
                    shard_number,
                    shard_count,
                };
-                let mut ts = TenantShard::new(
+                TenantShard::new(
                    tenant_shard_id,
                    ShardIdentity::new(
                        shard_number,
@@ -1606,13 +1608,8 @@ pub(crate) mod tests {
                    )
                    .unwrap(),
                    policy.clone(),
-                );
-
-                if let Some(az) = &preferred_az {
-                    ts.set_preferred_az(az.clone());
-                }
-
-                ts
+                    preferred_az.clone(),
+                )
            })
            .collect()
    }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -533,8 +533,9 @@ async fn list_timeline_blobs_impl(
 }

 pub(crate) struct RemoteTenantManifestInfo {
-    pub(crate) latest_generation: Option<Generation>,
-    pub(crate) manifests: Vec<(Generation, ListingObject)>,
+    pub(crate) generation: Generation,
+    pub(crate) manifest: TenantManifest,
+    pub(crate) listing_object: ListingObject,
 }

 pub(crate) enum ListTenantManifestResult {
@@ -543,7 +544,10 @@ pub(crate) enum ListTenantManifestResult {
        #[allow(dead_code)]
        unknown_keys: Vec<ListingObject>,
    },
-    NoErrors(RemoteTenantManifestInfo),
+    NoErrors {
+        latest_generation: Option<RemoteTenantManifestInfo>,
+        manifests: Vec<(Generation, ListingObject)>,
+    },
 }

 /// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
@@ -592,14 +596,6 @@ pub(crate) async fn list_tenant_manifests(
        unknown_keys.push(obj);
    }

-    if manifests.is_empty() {
-        tracing::debug!("No manifest for timeline.");
-
-        return Ok(ListTenantManifestResult::WithErrors {
-            errors,
-            unknown_keys,
-        });
-    }
    if !unknown_keys.is_empty() {
        errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));

@@ -609,6 +605,15 @@ pub(crate) async fn list_tenant_manifests(
        });
    }

+    if manifests.is_empty() {
+        tracing::debug!("No manifest for timeline.");
+
+        return Ok(ListTenantManifestResult::NoErrors {
+            latest_generation: None,
+            manifests,
+        });
+    }
+
    // Find the manifest with the highest generation
    let (latest_generation, latest_listing_object) = manifests
        .iter()
@@ -616,6 +621,8 @@ pub(crate) async fn list_tenant_manifests(
        .map(|(g, obj)| (*g, obj.clone()))
        .unwrap();

+    manifests.retain(|(gen, _obj)| gen != &latest_generation);
+
    let manifest_bytes =
        match download_object_with_retries(remote_client, &latest_listing_object.key).await {
            Ok(bytes) => bytes,
@@ -634,13 +641,15 @@ pub(crate) async fn list_tenant_manifests(
        };

    match TenantManifest::from_json_bytes(&manifest_bytes) {
-        Ok(_manifest) => {
-            return Ok(ListTenantManifestResult::NoErrors(
-                RemoteTenantManifestInfo {
-                    latest_generation: Some(latest_generation),
-                    manifests,
-                },
-            ));
+        Ok(manifest) => {
+            return Ok(ListTenantManifestResult::NoErrors {
+                latest_generation: Some(RemoteTenantManifestInfo {
+                    generation: latest_generation,
+                    manifest,
+                    listing_object: latest_listing_object,
+                }),
+                manifests,
+            });
        }
        Err(parse_error) => errors.push((
            latest_listing_object.key.get_path().as_str().to_owned(),
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -86,6 +86,8 @@ enum Command {
        /// For safekeeper node_kind only, json list of timelines and their lsn info
        #[arg(long, default_value = None)]
        timeline_lsns: Option<String>,
+        #[arg(long, default_value_t = false)]
+        verbose: bool,
    },
    TenantSnapshot {
        #[arg(long = "tenant-id")]
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
            dump_db_connstr,
            dump_db_table,
            timeline_lsns,
+            verbose,
        } => {
            if let NodeKind::Safekeeper = node_kind {
                let db_or_list = match (timeline_lsns, dump_db_connstr) {
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
                    tenant_ids,
                    json,
                    post_to_storcon,
+                    verbose,
                    cli.exit_code,
                )
                .await
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
        Vec::new(),
        true,
        post_to_storcon,
+        false, // default to non-verbose mode
        exit_code,
    )
    .await?;
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
    tenant_shard_ids: Vec<TenantShardId>,
    json: bool,
    post_to_storcon: bool,
+    verbose: bool,
    exit_code: bool,
 ) -> anyhow::Result<()> {
    if controller_client.is_none() && post_to_storcon {
        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
    }
-    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
+    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
        Err(e) => {
            tracing::error!("Failed: {e}");
            Err(e)
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,11 +4,13 @@ use std::time::Duration;

 use crate::checks::{
    list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
+    RemoteTenantManifestInfo,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
 use pageserver::tenant::remote_timeline_client::{
    parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
 };
@@ -527,7 +529,7 @@ async fn gc_tenant_manifests(
    target: &RootTarget,
    mode: GcMode,
    tenant_shard_id: TenantShardId,
-) -> anyhow::Result<GcSummary> {
+) -> anyhow::Result<(GcSummary, Option<RemoteTenantManifestInfo>)> {
    let mut gc_summary = GcSummary::default();
    match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
        ListTenantManifestResult::WithErrors {
@@ -537,33 +539,35 @@ async fn gc_tenant_manifests(
            for (_key, error) in errors {
                tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
            }
+            Ok((gc_summary, None))
        }
-        ListTenantManifestResult::NoErrors(mut manifest_info) => {
-            let Some(latest_gen) = manifest_info.latest_generation else {
-                return Ok(gc_summary);
+        ListTenantManifestResult::NoErrors {
+            latest_generation,
+            mut manifests,
+        } => {
+            let Some(latest_generation) = latest_generation else {
+                return Ok((gc_summary, None));
            };
-            manifest_info
-                .manifests
-                .sort_by_key(|(generation, _obj)| *generation);
+            manifests.sort_by_key(|(generation, _obj)| *generation);
            // skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
-            let candidates = manifest_info.manifests.iter().rev().skip(2);
+            let candidates = manifests.iter().rev().skip(2);
            for (_generation, key) in candidates {
                maybe_delete_tenant_manifest(
                    remote_client,
                    &min_age,
-                    latest_gen,
+                    latest_generation.generation,
                    key,
                    mode,
                    &mut gc_summary,
                )
                .instrument(
-                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
+                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key),
                )
                .await;
            }
+            Ok((gc_summary, Some(latest_generation)))
        }
    }
-    Ok(gc_summary)
 }

 async fn gc_timeline(
@@ -573,6 +577,7 @@ async fn gc_timeline(
    mode: GcMode,
    ttid: TenantShardTimelineId,
    accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+    tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
 ) -> anyhow::Result<GcSummary> {
    let mut summary = GcSummary::default();
    let data = list_timeline_blobs(remote_client, ttid, target).await?;
@@ -597,6 +602,60 @@ async fn gc_timeline(
        }
    };

+    if let Some(tenant_manifest_info) = &*tenant_manifest_info {
+        // TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead.
+        let maybe_offloaded = tenant_manifest_info
+            .manifest
+            .offloaded_timelines
+            .iter()
+            .find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id);
+        if let Some(offloaded) = maybe_offloaded {
+            let warnings = validate_index_part_with_offloaded(index_part, offloaded);
+            let warn = if warnings.is_empty() {
+                false
+            } else {
+                // Re-list the manifests. If the manifest changed since we loaded it, a racing change may explain the mismatch, so we don't warn.
+                match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? {
+                    ListTenantManifestResult::WithErrors {
+                        errors,
+                        unknown_keys: _,
+                    } => {
+                        for (_key, error) in errors {
+                            tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}");
+                        }
+                        true
+                    }
+                    ListTenantManifestResult::NoErrors {
+                        latest_generation,
+                        manifests: _,
+                    } => {
+                        if let Some(new_latest_gen) = latest_generation {
+                            let manifest_changed = (
+                                new_latest_gen.generation,
+                                new_latest_gen.listing_object.last_modified,
+                            ) != (
+                                tenant_manifest_info.generation,
+                                tenant_manifest_info.listing_object.last_modified,
+                            );
+                            if manifest_changed {
+                                tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len());
+                            }
+                            !manifest_changed
+                        } else {
+                            // The latest generation is gone. Is this timeline in the process of being deleted?
+                            false
+                        }
+                    }
+                }
+            };
+            if warn {
+                for warning in warnings {
+                    tracing::warn!(%ttid, "{}", warning);
+                }
+            }
+        }
+    }
+
    accumulator.lock().unwrap().update(ttid, index_part);

    for key in candidates {
@@ -608,6 +667,35 @@ async fn gc_timeline(
    Ok(summary)
 }

+/// Cross-checks a timeline's index part against its offloaded-timeline entry in the tenant
+/// manifest, returning one human-readable warning per mismatch (empty when consistent).
+fn validate_index_part_with_offloaded(
+    index_part: &IndexPart,
+    offloaded: &OffloadedTimelineManifest,
+) -> Vec<String> {
+    let mut warnings = Vec::new();
+    if let Some(archived_at_index_part) = index_part.archived_at {
+        // Only whole seconds are compared, so sub-second differences are ignored.
+        let delta = archived_at_index_part.signed_duration_since(offloaded.archived_at);
+        if delta.num_seconds() != 0 {
+            warnings.push(format!(
+                "index-part archived_at={} differs from manifest archived_at={}",
+                archived_at_index_part, offloaded.archived_at
+            ));
+        }
+    } else {
+        warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string());
+    }
+    if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id {
+        warnings.push(format!(
+            "index-part ancestor={:?} differs from manifest ancestor={:?}",
+            index_part.metadata.ancestor_timeline(),
+            offloaded.ancestor_timeline_id
+        ));
+    }
+    warnings
+}
+
 /// Physical garbage collection: removing unused S3 objects.
 ///
 /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
@@ -650,29 +738,38 @@ pub async fn pageserver_physical_gc(
        let target_ref = &target;
        let remote_client_ref = &remote_client;
        async move {
-            let summaries_from_manifests = match gc_tenant_manifests(
+            let gc_manifest_result = gc_tenant_manifests(
                remote_client_ref,
                min_age,
                target_ref,
                mode,
                tenant_shard_id,
            )
-            .await
-            {
-                Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
-                    gc_summary,
-                ))],
+            .await;
+            let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result {
+                Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest),
                Err(e) => {
                    tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
-                    Vec::new()
+                    (GcSummary::default(), None)
                }
            };
+            let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
+            let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
+                summary_from_manifest,
+            ));
            stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
                .await
                .map(|stream| {
                    stream
-                        .map_ok(GcSummaryOrContent::Content)
-                        .chain(futures::stream::iter(summaries_from_manifests.into_iter()))
+                        .zip(futures::stream::iter(std::iter::repeat(
+                            tenant_manifest_arc,
+                        )))
+                        .map(|(ttid_res, tenant_manifest_arc)| {
+                            ttid_res.map(move |ttid| {
+                                GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
+                            })
+                        })
+                        .chain(futures::stream::iter([summary_from_manifest].into_iter()))
                })
        }
    });
@@ -684,14 +781,17 @@ pub async fn pageserver_physical_gc(
    // Drain futures for per-shard GC, populating accumulator as a side effect
    {
        let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
-            GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
-                &remote_client,
-                &min_age,
-                &target,
-                mode,
-                ttid,
-                &accumulator,
-            )),
+            GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
+                futures::future::Either::Left(gc_timeline(
+                    &remote_client,
+                    &min_age,
+                    &target,
+                    mode,
+                    ttid,
+                    &accumulator,
+                    tenant_manifest_arc,
+                ))
+            }
            GcSummaryOrContent::GcSummary(gc_summary) => {
                futures::future::Either::Right(futures::future::ok(gc_summary))
            }
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -21,8 +21,12 @@ pub struct MetadataSummary {
    tenant_count: usize,
    timeline_count: usize,
    timeline_shard_count: usize,
-    with_errors: HashSet<TenantShardTimelineId>,
-    with_warnings: HashSet<TenantShardTimelineId>,
+    /// Maps each tenant-shard timeline (key) to its errors. The key has to be a string because the map is serialized to JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`. The error list is left empty unless the scan runs in verbose mode.
+    with_errors: HashMap<String, Vec<String>>,
+    /// Maps each tenant-shard timeline (key) to its warnings. The key has to be a string because the map is serialized to JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`. The warning list is left empty unless the scan runs in verbose mode.
+    with_warnings: HashMap<String, Vec<String>>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,

@@ -52,7 +56,12 @@ impl MetadataSummary {
        }
    }

-    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+    fn update_analysis(
+        &mut self,
+        id: &TenantShardTimelineId,
+        analysis: &TimelineAnalysis,
+        verbose: bool,
+    ) {
        if analysis.is_healthy() {
            self.healthy_tenant_shards.insert(id.tenant_shard_id);
        } else {
@@ -61,11 +70,17 @@ impl MetadataSummary {
        }

        if !analysis.errors.is_empty() {
-            self.with_errors.insert(*id);
+            let entry = self.with_errors.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.errors.iter().cloned());
+            }
        }

        if !analysis.warnings.is_empty() {
-            self.with_warnings.insert(*id);
+            let entry = self.with_warnings.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.warnings.iter().cloned());
+            }
        }
    }

@@ -120,6 +135,7 @@ Index versions: {version_summary}
 pub async fn scan_pageserver_metadata(
    bucket_config: BucketConfig,
    tenant_ids: Vec<TenantShardId>,
+    verbose: bool,
 ) -> anyhow::Result<MetadataSummary> {
    let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;

@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
        mut tenant_objects: TenantObjectListing,
        timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
        highest_shard_count: ShardCount,
+        verbose: bool,
    ) {
        summary.tenant_count += 1;

@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
                        Some(data),
                    )
                    .await;
-                    summary.update_analysis(&ttid, &analysis);
+                    summary.update_analysis(&ttid, &analysis, verbose);

                    timeline_ids.insert(ttid.timeline_id);
                } else {
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
        summary.update_data(&data);

        match tenant_id {
-            None => {
-                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
-                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
-            }
            Some(prev_tenant_id) => {
                if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
                    // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
                        tenant_objects,
                        timelines,
                        highest_shard_count,
+                        verbose,
                    )
                    .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
                    .await;
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
                    highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
                }
            }
+            None => {
+                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
+            }
        }

        match &data.blob_data {
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
            tenant_objects,
            tenant_timeline_results,
            highest_shard_count,
+            verbose,
        )
        .instrument(info_span!("analyze-tenant", tenant = %tenant_id))
        .await;
--- a/test_runner/fixtures/httpserver.py
+++ b/test_runner/fixtures/httpserver.py
@@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer

 if TYPE_CHECKING:
    from collections.abc import Iterator
+    from ssl import SSLContext

    from fixtures.port_distributor import PortDistributor

-# TODO: mypy fails with:
-#  Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor"  [attr-defined]
-# from fixtures.neon_fixtures import PortDistributor
+    ListenAddress = tuple[str, int]

 # compared to the fixtures from pytest_httpserver with same names, these are
 # always function scoped, so you can check and stop the server in tests.


@pytest.fixture(scope="function")
-def httpserver_ssl_context():
-    return None
+def httpserver_ssl_context() -> Iterator[SSLContext | None]:
+    yield None


@pytest.fixture(scope="function")
-def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]:
+def make_httpserver(
+    httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None
+) -> Iterator[HTTPServer]:
    host, port = httpserver_listen_address
    if not host:
        host = HTTPServer.DEFAULT_LISTEN_HOST
@@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]:


@pytest.fixture(scope="function")
-def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]:
+def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress:
    port = port_distributor.get_port()
    return ("localhost", port)
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -178,6 +178,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    counter("pageserver_timeline_wal_records_received"),
    counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
    *histogram("pageserver_page_service_batch_size"),
+    *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_directory_entries_count", -- only used if above a certain threshold
    # "pageserver_broken_tenants_count" -- used only for broken
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -435,7 +435,10 @@ class NeonEnvBuilder:

        self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode

-        self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
+        if pageserver_wal_receiver_protocol is not None:
+            self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
+        else:
+            self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED

        assert test_name.startswith(
            "test_"
@@ -2329,6 +2332,16 @@ class NeonStorageController(MetricsGetter, LogUtils):
                return None
            raise e

+    def get_safekeepers(self) -> list[dict[str, Any]]:
+        """GET `/control/v1/safekeeper` with an admin-scoped token; the JSON body must be a list."""
+        body = self.request(
+            "GET",
+            f"{self.api}/control/v1/safekeeper",
+            headers=self.headers(TokenScope.ADMIN),
+        ).json()
+        assert isinstance(body, list)
+        return body
+
    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
        response = self.request(
            "PUT",
@@ -3209,7 +3222,7 @@ class NeonProxy(PgProtocol):
                *["--allow-self-signed-compute", "true"],
            ]

-    class Console(AuthBackend):
+    class ProxyV1(AuthBackend):
        def __init__(self, endpoint: str, fixed_rate_limit: int | None = None):
            self.endpoint = endpoint
            self.fixed_rate_limit = fixed_rate_limit
@@ -3217,7 +3230,7 @@ class NeonProxy(PgProtocol):
        def extra_args(self) -> list[str]:
            args = [
                # Console auth backend params
-                *["--auth-backend", "console"],
+                *["--auth-backend", "cplane-v1"],
                *["--auth-endpoint", self.endpoint],
                *["--sql-over-http-pool-opt-in", "false"],
            ]
@@ -3465,13 +3478,13 @@ class NeonProxy(PgProtocol):


 class NeonAuthBroker:
-    class ControlPlane:
+    class ProxyV1:
        def __init__(self, endpoint: str):
            self.endpoint = endpoint

        def extra_args(self) -> list[str]:
            args = [
-                *["--auth-backend", "console"],
+                *["--auth-backend", "cplane-v1"],
                *["--auth-endpoint", self.endpoint],
            ]
            return args
@@ -3483,7 +3496,7 @@ class NeonAuthBroker:
        http_port: int,
        mgmt_port: int,
        external_http_port: int,
-        auth_backend: NeonAuthBroker.ControlPlane,
+        auth_backend: NeonAuthBroker.ProxyV1,
    ):
        self.domain = "apiauth.localtest.me"  # resolves to 127.0.0.1
        self.host = "127.0.0.1"
@@ -3669,7 +3682,7 @@ def static_auth_broker(
    local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}"

    # return local_proxy addr on ProxyWakeCompute.
-    httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
+    httpserver.expect_request("/cplane/wake_compute").respond_with_json(
        {
            "address": local_proxy_addr,
            "aux": {
@@ -3709,7 +3722,7 @@ def static_auth_broker(
        http_port=http_port,
        mgmt_port=mgmt_port,
        external_http_port=external_http_port,
-        auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")),
+        auth_backend=NeonAuthBroker.ProxyV1(httpserver.url_for("/cplane")),
    ) as proxy:
        proxy.start()
        yield proxy
@@ -4556,6 +4569,7 @@ class StorageScrubber:
    def __init__(self, env: NeonEnv, log_dir: Path):
        self.env = env
        self.log_dir = log_dir
+        self.allowed_errors: list[str] = []

    def scrubber_cli(
        self, args: list[str], timeout, extra_env: dict[str, str] | None = None
@@ -4633,19 +4647,70 @@ class StorageScrubber:
        if timeline_lsns is not None:
            args.append("--timeline-lsns")
            args.append(json.dumps(timeline_lsns))
+        if node_kind == NodeKind.PAGESERVER:
+            args.append("--verbose")
        stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)

        try:
            summary = json.loads(stdout)
-            # summary does not contain "with_warnings" if node_kind is the safekeeper
-            no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
-            healthy = not summary["with_errors"] and no_warnings
+            healthy = self._check_run_healthy(summary)
            return healthy, summary
        except:
            log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
            log.error(stdout)
            raise

+    def _check_line_allowed(self, line: str) -> bool:
+        """Return True if any regex in `self.allowed_errors` matches `line` (via `re.match`)."""
+        for pattern in self.allowed_errors:
+            try:
+                if re.match(pattern, line) is not None:
+                    return True
+            except re.error:
+                # A malformed pattern is logged and the `re.error` is re-raised to the caller.
+                log.error(f"Invalid regex: '{pattern}'")
+                raise
+        return False
+
+    def _check_line_list_allowed(self, lines: list[str]) -> bool:
+        """Return True if every line in `lines` is allowed (vacuously True when empty)."""
+        return all(self._check_line_allowed(line) for line in lines)
+
+    def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
+        """
+        Decide whether a `scan-metadata` JSON summary describes a healthy run.
+
+        "with_warnings" is inspected first and "with_errors" only if the warnings were healthy.
+        A missing (or null) section is healthy: the summary does not contain "with_warnings"
+        if node_kind is the safekeeper.
+
+        A section is either a list (safekeeper output, a list of tenants), which is healthy
+        only when empty, or a dict whose values are lists of messages, which is healthy only
+        when every message is matched by `self.allowed_errors`.
+
+        Each dict value must be non-empty; an empty one trips an assertion, since it
+        suggests the scan was run without `--verbose`.
+        """
+
+        def section_healthy(key: str) -> bool:
+            section = summary.get(key, None)
+            if section is None:
+                return True
+            if isinstance(section, list):
+                # safekeeper scan_metadata output is a list of tenants
+                return len(section) == 0
+            for messages in section.values():
+                # checked per entry, before that entry's messages are matched
+                assert (
+                    len(messages) > 0
+                ), f"{key} value should not be empty, running without verbose mode?"
+                if not self._check_line_list_allowed(messages):
+                    return False
+            return True
+
+        # `and` short-circuits, so errors are not examined once the warnings are unhealthy
+        return section_healthy("with_warnings") and section_healthy("with_errors")
+
    def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
        stdout = self.scrubber_cli(
            ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        )
        self.verbose_error(res)

-    def patch_tenant_config_client_side(
+    def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
+        """
+        PATCH `/v1/tenant/config` with `updates` plus a "tenant_id" entry.
+        Only use this via storage_controller.pageserver_api().
+
+        See `set_tenant_config` for more information.
+        """
+        # "tenant_id" is added to the request body here, so `updates` must not carry its own
+        assert "tenant_id" not in updates
+        payload = {**updates, "tenant_id": str(tenant_id)}
+        res = self.patch(f"http://localhost:{self.port}/v1/tenant/config", json=payload)
+        self.verbose_error(res)
+
+    def update_tenant_config(
        self,
        tenant_id: TenantId,
        inserts: dict[str, Any] | None = None,
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):

        See `set_tenant_config` for more information.
        """
-        current = self.tenant_config(tenant_id).tenant_specific_overrides
-        if inserts is not None:
-            current.update(inserts)
-        if removes is not None:
-            for key in removes:
-                del current[key]
-        self.set_tenant_config(tenant_id, current)
+        if inserts is None:
+            inserts = {}
+        if removes is None:
+            removes = []
+
+        patch = inserts | {remove: None for remove in removes}
+        self.patch_tenant_config(tenant_id, patch)

    def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
        return self.tenant_size_and_modelinputs(tenant_id)[0]
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -70,6 +70,9 @@ class MockS3Server:
    def secret_key(self) -> str:
        return "test"

+    def session_token(self) -> str:
+        return "test"
+
    def kill(self):
        self.server.stop()

@@ -161,6 +164,7 @@ class S3Storage:
    bucket_region: str
    access_key: str | None
    secret_key: str | None
+    session_token: str | None
    aws_profile: str | None
    prefix_in_bucket: str
    client: S3Client
@@ -181,13 +185,18 @@ class S3Storage:
            if home is not None:
                env["HOME"] = home
            return env
-        if self.access_key is not None and self.secret_key is not None:
+        if (
+            self.access_key is not None
+            and self.secret_key is not None
+            and self.session_token is not None
+        ):
            return {
                "AWS_ACCESS_KEY_ID": self.access_key,
                "AWS_SECRET_ACCESS_KEY": self.secret_key,
+                "AWS_SESSION_TOKEN": self.session_token,
            }
        raise RuntimeError(
-            "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage"
+            "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN) have to be set for S3Storage"
        )

    def to_string(self) -> str:
@@ -352,6 +361,7 @@ class RemoteStorageKind(StrEnum):
            mock_region = mock_s3_server.region()

            access_key, secret_key = mock_s3_server.access_key(), mock_s3_server.secret_key()
+            session_token = mock_s3_server.session_token()

            client = boto3.client(
                "s3",
@@ -359,6 +369,7 @@ class RemoteStorageKind(StrEnum):
                region_name=mock_region,
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
+                aws_session_token=session_token,
            )

            bucket_name = to_bucket_name(user, test_name)
@@ -372,6 +383,7 @@ class RemoteStorageKind(StrEnum):
                bucket_region=mock_region,
                access_key=access_key,
                secret_key=secret_key,
+                session_token=session_token,
                aws_profile=None,
                prefix_in_bucket="",
                client=client,
@@ -383,9 +395,10 @@ class RemoteStorageKind(StrEnum):

        env_access_key = os.getenv("AWS_ACCESS_KEY_ID")
        env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
+        env_access_token = os.getenv("AWS_SESSION_TOKEN")
        env_profile = os.getenv("AWS_PROFILE")
        assert (
-            env_access_key and env_secret_key
+            env_access_key and env_secret_key and env_access_token
        ) or env_profile, "need to specify either access key and secret access key or profile"

        bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET")
@@ -398,6 +411,9 @@ class RemoteStorageKind(StrEnum):
        client = boto3.client(
            "s3",
            region_name=bucket_region,
+            aws_access_key_id=env_access_key,
+            aws_secret_access_key=env_secret_key,
+            aws_session_token=env_access_token,
        )

        return S3Storage(
@@ -405,6 +421,7 @@ class RemoteStorageKind(StrEnum):
            bucket_region=bucket_region,
            access_key=env_access_key,
            secret_key=env_secret_key,
+            session_token=env_access_token,
            aws_profile=env_profile,
            prefix_in_bucket=prefix_in_bucket,
            client=client,
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -153,6 +153,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
        if i % 10 == 0:
            log.info(f"Running churn round {i}/{churn_rounds} ...")

+        if (i - 1) % 10 == 0:
            # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
            ps_http.timeline_compact(
                tenant_id,
@@ -161,10 +162,11 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
                body={
                    "scheduled": True,
                    "sub_compaction": True,
-                    "compact_range": {
+                    "compact_key_range": {
                        "start": "000000000000000000000000000000000000",
                        "end": "030000000000000000000000000000000000",
                    },
+                    "sub_compaction_max_job_size_mb": 16,
                },
            )

--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -15,6 +15,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
    from typing import Any, Self

+    from fixtures.httpserver import ListenAddress
+

 def handle_db(dbs, roles, operation):
    if operation["op"] == "set":
@@ -120,7 +122,7 @@ class DdlForwardingContext:

@pytest.fixture(scope="function")
 def ddl(
-    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int]
+    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: ListenAddress
 ):
    (host, port) = httpserver_listen_address
    with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl:
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
    assert (
        du_by_timeline[large_tenant] > min_resident_size
    ), "ensure the larger tenant will get a haircut"
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
        small_tenant[0], {"min_resident_size_override": min_resident_size}
    )
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
        large_tenant[0], {"min_resident_size_override": min_resident_size}
    )

--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -20,6 +20,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
    from typing import Any

+    from fixtures.httpserver import ListenAddress
+

 # use neon_env_builder_local fixture to override the default neon_env_builder fixture
 # and use a test-specific pg_install instead of shared one
@@ -47,8 +49,8 @@ def neon_env_builder_local(
 def test_remote_extensions(
    httpserver: HTTPServer,
    neon_env_builder_local: NeonEnvBuilder,
-    httpserver_listen_address,
-    pg_version,
+    httpserver_listen_address: ListenAddress,
+    pg_version: PgVersion,
 ):
    # setup mock http server
    # that expects request for anon.tar.zst
--- a/test_runner/regress/test_ingestion_layer_size.py
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
    print_layer_size_histogram(post_ingest)

    # since all we have are L0s, we should be getting nice L1s and images out of them now
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
        env.initial_tenant,
        {
            "compaction_threshold": 1,
--- a/test_runner/regress/test_installed_extensions.py
+++ b/test_runner/regress/test_installed_extensions.py
@@ -30,7 +30,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
    info("Extensions: %s", res["extensions"])
    # 'plpgsql' is a default extension that is always installed.
    assert any(
-        ext["extname"] == "plpgsql" and ext["versions"] == ["1.0"] for ext in res["extensions"]
+        ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"]
    ), "The 'plpgsql' extension is missing"

    # check that the neon_test_utils extension is not installed
@@ -63,7 +63,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
    # and has the expected version
    assert any(
        ext["extname"] == "neon_test_utils"
-        and ext["versions"] == [neon_test_utils_version]
+        and ext["version"] == neon_test_utils_version
        and ext["n_databases"] == 1
        for ext in res["extensions"]
    )
@@ -75,9 +75,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
    # check that the neon extension is installed and has expected versions
    for ext in res["extensions"]:
        if ext["extname"] == "neon":
-            assert ext["n_databases"] == 2
-            ext["versions"].sort()
-            assert ext["versions"] == ["1.1", "1.2"]
+            assert ext["version"] in ["1.1", "1.2"]
+            assert ext["n_databases"] == 1

    with pg_conn.cursor() as cur:
        cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'")
@@ -90,9 +89,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
    # check that the neon_test_utils extension is updated
    for ext in res["extensions"]:
        if ext["extname"] == "neon":
-            assert ext["n_databases"] == 2
-            ext["versions"].sort()
-            assert ext["versions"] == ["1.2", "1.3"]
+            assert ext["version"] in ["1.2", "1.3"]
+            assert ext["n_databases"] == 1

    # check that /metrics endpoint is available
    # ensure that we see the metric before and after restart
@@ -100,13 +98,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
    info("Metrics: %s", res)
    m = parse_metrics(res)
    neon_m = m.query_all(
-        "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+        "compute_installed_extensions",
+        {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"},
    )
    assert len(neon_m) == 1
    for sample in neon_m:
-        assert sample.value == 2
+        assert sample.value == 1
    neon_m = m.query_all(
-        "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+        "compute_installed_extensions",
+        {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"},
    )
    assert len(neon_m) == 1
    for sample in neon_m:
@@ -138,14 +138,16 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
        info("After restart metrics: %s", res)
        m = parse_metrics(res)
        neon_m = m.query_all(
-            "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+            "compute_installed_extensions",
+            {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"},
        )
        assert len(neon_m) == 1
        for sample in neon_m:
            assert sample.value == 1

        neon_m = m.query_all(
-            "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+            "compute_installed_extensions",
+            {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"},
        )
        assert len(neon_m) == 1
        for sample in neon_m:
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
    ), "sanity check for what above loop is supposed to do"

    # create the image layer from the future
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
        tenant_id, {"image_creation_threshold": image_creation_threshold}, None
    )
    assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -573,17 +573,18 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van
    vanilla_pg.safe_psql("create extension neon;")

    env.create_branch("subscriber")
-    # We want all data to fit into shared_buffers because later we stop
-    # safekeeper and insert more; this shouldn't cause page requests as they
-    # will be stuck.
+    # We want all data to fit into shared_buffers or LFC cache because later we
+    # stop safekeeper and insert more; this shouldn't cause page requests as
+    # they will be stuck.
+    if USE_LFC:
+        config_lines = ["neon.max_file_cache_size = 32MB", "neon.file_cache_size_limit = 32MB"]
+    else:
+        config_lines = [
+            "shared_buffers = 32MB",
+        ]
    sub = env.endpoints.create(
        "subscriber",
-        config_lines=[
-            "neon.max_file_cache_size = 32MB",
-            "neon.file_cache_size_limit = 32MB",
-        ]
-        if USE_LFC
-        else [],
+        config_lines=config_lines,
    )
    sub.start()

--- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py
+++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
@@ -0,0 +1,124 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+BTREE_NUM_CYCLEID_PAGES = """
+    WITH raw_pages AS (
+        SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
+        FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
+    ),
+    parsed_pages AS (
+        /* cycle ID is the last 2 bytes of the btree page */
+        SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
+        FROM raw_pages
+    )
+    SELECT count(*),
+           encode(cycle_id, 'hex')
+     FROM parsed_pages
+    WHERE encode(cycle_id, 'hex') != '0000'
+    GROUP BY encode(cycle_id, 'hex');
+    """
+
+
+def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    ses1 = endpoint.connect().cursor()
+    ses1.execute("ALTER SYSTEM SET autovacuum = off;")
+    ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
+    ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
+    ses1.execute("SELECT pg_reload_conf();")
+    ses1.execute("CREATE EXTENSION neon_test_utils;")
+    # prepare a large index
+    ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
+    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
+
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 0
+    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
+    # Delete enough tuples to clear the first index page.
+    # (There are up to 407 rows per 8KiB page; 406 for non-rightmost leaves.)
+    ses1.execute("DELETE FROM t WHERE id <= 406;")
+    # Make sure the page is cleaned up
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Do another delete-then-indexcleanup cycle, to move the pages from
+    # "dead" to "reusable"
+    ses1.execute("DELETE FROM t WHERE id <= 446;")
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Make sure the vacuum we're about to trigger in s3 has cleanup work to do
+    ses1.execute("DELETE FROM t WHERE id <= 610;")
+
+    # Flush wal, for checking purposes
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
+
+    ses2 = endpoint.connect().cursor()
+    ses3 = endpoint.connect().cursor()
+
+    # Session 2 pins a btree page, which prevents vacuum from processing that
+    # page, thus allowing us to reliably split pages while a concurrent vacuum
+    # is running.
+    ses2.execute("BEGIN;")
+    ses2.execute(
+        "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
+    )
+    ses2.execute("FETCH FROM foo;")  # pins the leaf page with id 611
+    wait_evt = threading.Event()
+
+    # Session 3 runs the VACUUM command. Note that this will block, and
+    # therefore must run on another thread.
+    # We rely on this running quickly enough to hit the pinned page from
+    # session 2 by the time we start other work again in session 1, but
+    # technically there is a race where the thread (and/or PostgreSQL process)
+    # don't get to that pinned page with vacuum until >2s after evt.set() was
+    # called, and session 1 thus might already have split pages.
+    def vacuum_freeze_t(ses3, evt: threading.Event):
+        # Begin parallel vacuum that should hit the index
+        evt.set()
+        # this'll hang until s2 fetches enough new data from its cursor.
+        # this is technically a race with the time.sleep(2) below: the test
+        # relies on this command reaching the pinned page before that sleep ends.
+        ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")
+
+    ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt))
+    ses3t.start()
+    wait_evt.wait()
+    # Make extra sure we got the thread started and vacuum is stuck, by waiting
+    # some time even after wait_evt got set. This isn't truly reliable (it is
+    # possible that vacuum reaches the pinned page only after this sleep ends).
+    time.sleep(2)
+
+    # Insert 2 pages worth of new data.
+    # This should reuse the one empty page, plus another page at the end of
+    # the index relation; with split ordering
+    #    old_blk -> blkno=1 -> old_blk + 1.
+    # As this is run while vacuum in session 3 is happening, these splits
+    # should receive cycle IDs where applicable.
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
+    # unpin the btree page, allowing s3's vacuum to complete
+    ses2.execute("FETCH ALL FROM foo;")
+    ses2.execute("ROLLBACK;")
+    # flush WAL to make sure PS is up-to-date
+    ses1.execute("SELECT neon_xlogflush();")
+    # check that our expectations are correct
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 1 and pages[0][0] == 3
+    ), f"3 page splits with cycle ID expected; actual {pages}"
+
+    # final cleanup
+    ses3t.join()
+    ses1.close()
+    ses2.close()
+    ses3.close()
--- a/test_runner/regress/test_pageserver_crash_consistency.py
+++ b/test_runner/regress/test_pageserver_crash_consistency.py
@@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
    for sk in env.safekeepers:
        sk.stop()

-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
        tenant_id, {"compaction_threshold": 3}
    )
    # hit the exit failpoint
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -27,6 +27,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
    from typing import Any

+    from fixtures.httpserver import ListenAddress
+

 # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP

@@ -34,7 +36,7 @@ if TYPE_CHECKING:
 def test_metric_collection(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
    (host, port) = httpserver_listen_address
    metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -195,7 +197,7 @@ def test_metric_collection(
 def test_metric_collection_cleans_up_tempfile(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
    (host, port) = httpserver_listen_address
    metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
--- a/test_runner/regress/test_proxy_metric_collection.py
+++ b/test_runner/regress/test_proxy_metric_collection.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 from collections.abc import Iterator
 from pathlib import Path
+from typing import TYPE_CHECKING

 import pytest
 from fixtures.log_helper import log
@@ -15,6 +16,9 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response

+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+

 def proxy_metrics_handler(request: Request) -> Response:
    if request.json is None:
@@ -38,7 +42,7 @@ def proxy_metrics_handler(request: Request) -> Response:
 def proxy_with_metric_collector(
    port_distributor: PortDistributor,
    neon_binpath: Path,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
    test_output_dir: Path,
 ) -> Iterator[NeonProxy]:
    """Neon proxy that routes through link auth and has metric collection enabled."""
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 import time
 from collections import defaultdict
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import pytest
 import requests
@@ -27,6 +27,9 @@ from typing_extensions import override
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response

+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+

 def test_sharding_smoke(
    neon_env_builder: NeonEnvBuilder,
@@ -759,7 +762,7 @@ def test_sharding_split_smoke(
 def test_sharding_split_stripe_size(
    neon_env_builder: NeonEnvBuilder,
    httpserver: HTTPServer,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
    initial_stripe_size: int,
 ):
    """
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -58,6 +58,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
    from typing import Any

+    from fixtures.httpserver import ListenAddress
+

 def get_node_shard_counts(env: NeonEnv, tenant_ids):
    counts: defaultdict[int, int] = defaultdict(int)
@@ -563,7 +565,7 @@ def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
 def test_storage_controller_compute_hook(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
    """
    Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
@@ -681,7 +683,7 @@ NOTIFY_FAILURE_LOGS = [
 def test_storage_controller_stuck_compute_hook(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
    """
    Test the migration process's behavior when the compute hook does not enable it to proceed
@@ -818,7 +820,7 @@ def test_storage_controller_stuck_compute_hook(
 def test_storage_controller_compute_hook_revert(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
    """
    'revert' in the sense of a migration which gets reversed shortly after, as may happen during
@@ -1768,7 +1770,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
    # Modify a tenant's config
    storcon_cli(
        [
-            "tenant-config",
+            "patch-tenant-config",
            "--tenant-id",
            str(env.initial_tenant),
            "--config",
@@ -2136,7 +2138,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
    env.start()

    tenant_count = 10
-    shard_count_per_tenant = 8
+    shard_count_per_tenant = 16
    tenant_ids = []

    for _ in range(0, tenant_count):
@@ -2403,7 +2405,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):

    # Make a change to the tenant config to trigger a slow reconcile
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
-    virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
+    virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None)
    env.storage_controller.allowed_errors.extend(
        [
            ".*Accepted configuration update but reconciliation failed.*",
@@ -2953,6 +2955,8 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):

    assert target.get_safekeeper(fake_id) is None

+    assert len(target.get_safekeepers()) == 0
+
    body = {
        "active": True,
        "id": fake_id,
@@ -2970,6 +2974,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):

    inserted = target.get_safekeeper(fake_id)
    assert inserted is not None
+    assert target.get_safekeepers() == [inserted]
    assert eq_safekeeper_records(body, inserted)

    # error out if pk is changed (unexpected)
@@ -2981,6 +2986,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
    assert exc.value.status_code == 400

    inserted_again = target.get_safekeeper(fake_id)
+    assert target.get_safekeepers() == [inserted_again]
    assert inserted_again is not None
    assert eq_safekeeper_records(inserted, inserted_again)

@@ -2989,6 +2995,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
    body["version"] += 1
    target.on_safekeeper_deploy(fake_id, body)
    inserted_now = target.get_safekeeper(fake_id)
+    assert target.get_safekeepers() == [inserted_now]
    assert inserted_now is not None

    assert eq_safekeeper_records(body, inserted_now)
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata(
    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)

-    neon_env_builder.disable_scrub_on_exit()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert not healthy
+    env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*")
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
+
+    neon_env_builder.disable_scrub_on_exit()  # We already ran scrubber, no need to do an extra run
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -3,13 +3,14 @@ from __future__ import annotations
 import json
 from typing import TYPE_CHECKING

+import pytest
 from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 from fixtures.workload import Workload

 if TYPE_CHECKING:
@@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
    metric = get_metric()
    assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
    assert int(metric.value) == 0, "value resets to default"
+
+
+@run_only_on_default_postgres("Test does not start a compute")
+@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
+def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
+    """
+    Test tenant config patching (i.e. additive updates)
+
+    The flow is different for storage controller and cplane managed pageserver.
+    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
+    location_config calls containing the update to the pageserver
+    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
+    """
+
+    def assert_tenant_conf_semantically_equal(lhs, rhs):
+        """
+        Storcon returns None for fields that are not set while the pageserver does not.
+        Compare two tenants' config overrides semantically, by dropping the None values.
+        """
+        lhs = {k: v for k, v in lhs.items() if v is not None}
+        rhs = {k: v for k, v in rhs.items() if v is not None}
+
+        assert lhs == rhs
+
+    env = neon_env_builder.init_start()
+
+    if ps_managed_by == "storcon":
+        api = env.storage_controller.pageserver_api()
+    elif ps_managed_by == "cplane":
+        # Disallow storcon from sending location_configs to the pageserver.
+        # These would overwrite the manually set tenant configs.
+        env.storage_controller.reconcile_until_idle()
+        env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
+        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
+
+        api = env.pageserver.http_client()
+    else:
+        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")
+
+    crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+
+    patch: dict[str, Any | None] = {
+        "gc_period": "3h",
+        "wal_receiver_protocol_override": {
+            "type": "interpreted",
+            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
+        },
+    }
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        # Check that the config was propagated to the PS.
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    put = {"pitr_interval": "1m 1s"}
+    api.set_tenant_config(env.initial_tenant, put)
+    tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
+    crnt_tenant_conf = tenant_conf_after_put
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import time
 from dataclasses import dataclass
+from typing import TYPE_CHECKING

 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -13,12 +14,15 @@ from fixtures.pageserver.http import LayerMapInfo
 from fixtures.remote_storage import RemoteStorageKind
 from pytest_httpserver import HTTPServer

+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+
 # NB: basic config change tests are in test_tenant_conf.py


 def test_threshold_based_eviction(
    httpserver: HTTPServer,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
    pg_bin: PgBin,
    neon_env_builder: NeonEnvBuilder,
 ):
@@ -81,7 +85,7 @@ def test_threshold_based_eviction(

    # create a bunch of L1s, only the least of which will need to be resident
    compaction_threshold = 3  # create L1 layers quickly
-    vps_http.patch_tenant_config_client_side(
+    vps_http.update_tenant_config(
        tenant_id,
        inserts={
            # Disable gc and compaction to avoid on-demand downloads from their side.
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history(

        assert len(delta_layers(branch_timeline_id)) == 5

-        env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+        env.storage_controller.pageserver_api().update_tenant_config(
            env.initial_tenant, {"compaction_threshold": 5}, None
        )

--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import time
 from contextlib import closing

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn
 from fixtures.utils import query_scalar
@@ -294,6 +295,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
    cur.execute("commit transaction")


+@pytest.mark.timeout(600)  # slow in debug builds
 def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Runs pgbench across a few databases on a sharded tenant, then performs a visibility map
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.2",
-    "a10d95be67265e0f10a422ba0457f5a7af01de71"
+    "01fa3c48664ca030cfb69bb4a350aa9df4691d88"
  ],
  "v16": [
    "16.6",
-    "dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
+    "81428621f7c04aed03671cf80a928e0a36d92505"
  ],
  "v15": [
    "15.10",
-    "972e325e62b455957adbbdd8580e31275bb5b8c9"
+    "8736b10c1d93d11b9c0489872dd529c4c0f5338f"
  ],
  "v14": [
    "14.15",
-    "373f9decad933d2d46f321231032ae8b0da81acd"
+    "13ff324150fceaac72920e01742addc053db9462"
  ]
 }
Author	SHA1	Message	Date
Arpad Müller	3c4221db2c	fmt	2024-12-13 18:54:20 +01:00
Arpad Müller	00ff70c905	Pass down timeouts to the reqwest client	2024-12-13 18:32:46 +01:00
Peter Bendel	2f3f98a319	use OIDC role instead of AWS access keys for managing test runner (#10117 ) in periodic pagebench workflow ## Problem for background see https://github.com/neondatabase/cloud/issues/21545 ## Summary of changes use OIDC role to manage runners instead of AWS access key which needs to be periodically rotated ## logs seems to work in https://github.com/neondatabase/neon/actions/runs/12298575888/job/34322306127#step:6:1	2024-12-12 20:25:39 +00:00
Alex Chi Z.	5ff4b991c7	feat(pageserver): gc-compaction split over LSN (#9900 ) ## Problem part of https://github.com/neondatabase/neon/issues/9114, stacked PR over https://github.com/neondatabase/neon/pull/9897, partially refactored to help with https://github.com/neondatabase/neon/issues/10031 ## Summary of changes * gc-compaction takes `above_lsn` parameter. We only compact the layers above this LSN, and all data below the LSN are treated as if they are on the ancestor branch. * refactored gc-compaction to take `GcCompactJob` that describes the rectangular range to be compacted. * Added unit test for this case. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-12-12 20:23:24 +00:00
John Spray	a93e3d31cc	storcon: refine logic for choosing AZ on tenant creation (#10054 ) ## Problem When we update our scheduler/optimization code to respect AZs properly (https://github.com/neondatabase/neon/pull/9916), the choice of AZ becomes a much higher-stakes decision. We will pretty much always run a tenant in its preferred AZ, and that AZ is fixed for the lifetime of the tenant (unless a human intervenes) Eventually, when we do auto-balancing based on utilization, I anticipate that part of that will be to automatically change the AZ of tenants if our original scheduling decisions have caused imbalance, but as an interim measure, we can at least avoid making this scheduling decision based purely on which AZ contains the emptiest node. This is a precursor to https://github.com/neondatabase/neon/pull/9947 ## Summary of changes - When creating a tenant, instead of scheduling a shard and then reading its preferred AZ back, make the AZ decision first. - Instead of choosing AZ based on which node is emptiest, use the median utilization of nodes in each AZ to pick the AZ to use. This avoids bad AZ decisions during periods when some node has very low utilization (such as after replacing a dead node) I considered also making the selection a weighted pseudo-random choice based on utilization, but wanted to avoid destabilising tests with that for now.	2024-12-12 19:35:38 +00:00
Rahul Patil	6d5687521b	fix(ci): Allow github-script to post test reports (#10120 ) Allow github-script to post test reports	2024-12-12 18:53:35 +00:00
Heikki Linnakangas	53721266f1	Disable connection logging in pgbouncer by default (#10118 ) It can produce a lot of logs, making pgbouncer itself consume all CPU in extreme cases. We saw that happen in stress testing.	2024-12-12 17:05:58 +00:00
a-masterov	2f3433876f	Change the channel for notification. (#10112 ) ## Problem Now notifications about failures in `pg_regress` tests run on the staging cloud instance, reach the channel `on-call-staging-stream`, while they should reach `on-call-qa-staging-stream` ## Summary of changes The channel changed.	2024-12-12 16:34:07 +00:00
Rahul Patil	58d45c6e86	ci(fix): Use OIDC auth to login on ECR (#10055 ) ## Problem CI currently uses static credentials in some places. These are less secure and hard to maintain, so we are going to deprecate them and use OIDC auth. ## Summary of changes - ci(fix): Use OIDC auth to upload artifact on s3 - ci(fix): Use OIDC auth to login on ECR	2024-12-12 15:13:08 +00:00
Conrad Ludgate	e502e880b5	chore(proxy): remove code for old API (#10109 ) ## Problem Now that https://github.com/neondatabase/cloud/issues/15245 is done, we can remove the old code. ## Summary of changes Removes support for the ManagementV2 API, in favour of the ProxyV1 API.	2024-12-12 13:42:50 +00:00
Arseny Sher	c9a773af37	Fix test_subscriber_synchronous_commit flakiness. (#10057 ) `6f7aeaa` configured LFC for USE_LFC case, but omitted setting shared_buffers for non USE_LFC, causing flakiness. ref https://github.com/neondatabase/neon/issues/9989	2024-12-12 11:57:00 +00:00
Vlad Lazar	ec0ce06c16	tests: default interpreted proto in tests (#10079 ) ## Problem We aren't using the sharded interpreted wal receiver protocol in all tests. ## Summary of changes Default to the interpreted protocol.	2024-12-12 10:53:10 +00:00
Alexander Bayandin	0bd8eca9ca	Storage: create release PRs On Fridays (#10017 ) ## Problem To give Storage more time on preprod — create a release branch on Friday ## Summary of changes - Automatically create Storage release PR on Friday instead of Monday	2024-12-12 09:18:50 +00:00
Misha Sakhnov	739f627b96	Bump vm-builder v0.35.0 -> v0.37.1 (#10015 ) Bump version to pick up changes introduced in the neonvm-daemon to support sys fs based CPU scaling (https://github.com/neondatabase/autoscaling/issues/1082). Previous update: https://github.com/neondatabase/neon/pull/9208	2024-12-12 08:45:52 +00:00
Arpad Müller	342cbea255	storcon: add safekeeper list API (#10089 ) This adds an API to the storage controller to list safekeepers registered to it. This PR does a `diesel print-schema > storage_controller/src/schema.rs` because of an inconsistency between up.sql and schema.rs, introduced by [this](`2c142f14f7`) commit, so there is some updates of `schema.rs` due to that. As a followup to this, we should maybe think about running `diesel print-schema` in CI. Part of #9981	2024-12-12 01:09:24 +00:00
Tristan Partin	b391b29bdc	Improve typing in test_runner/fixtures/httpserver.py (#10103 ) Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-12-11 22:21:42 +00:00
Erik Grinaker	5126ebbfed	test_runner: bump test_check_visibility_map timeout (#10091 ) ## Problem `test_check_visibility_map` has been seen to time out in debug tests. ## Summary of changes Bump the timeout to 10 minutes (test reports indicate 7 minutes is sufficient). We don't want to disable the test entirely in debug builds, to exercise this with debug assertions enabled. Resolves #10069.	2024-12-11 21:37:25 +00:00
Arpad Müller	7fa986bc92	Do tenant manifest validation with index-part (#10007 ) This adds some validation of invariants that we want to uphold wrt the tenant manifest and `index_part.json`: * the data the manifest has about a timeline must match with the data in `index_part.json`. It might actually change, e.g. when we do reparenting during detach ancestor, but that requires the timeline to be unoffloaded, i.e. removed from the manifest. * any timeline mentioned in index part, must, if present, be archived. If we unarchive, we first update the tenant manifest to unoffload, and only then update index part. And one needs to archive before offloading. * it is legal for timelines to be mentioned in the manifest but have no `index_part`: this is a temporary state visible during deletion of the timeline. if the pageserver crashed, an attach of the tenant will clean the state up. * it is also legal for offloaded timelines to have an `ancestor_retain_lsn` of None while having an `ancestor_timeline_id`. This is for the to-be-added flattening functionality: the plan is to set former to None if we have flattened a timeline. follow-up of #9942 part of #8088	2024-12-11 20:10:22 +00:00
Vlad Lazar	e8395807a5	storcon: allow for more concurrency in drain/fill operations (#10093 ) ## Problem We saw the drain/fill operations not drain fast enough in ap-southeast. ## Summary of changes These are some quick changes to speed it up: * double reconcile concurrency - this is now half of the available reconcile bandwidth * reduce the waiter polling timeout - this way we can spawn new reconciliations faster	2024-12-11 19:43:40 +00:00
Vlad Lazar	a3e80448e8	pageserver/storcon: add patch endpoints for tenant config metrics (#10020 ) ## Problem Cplane and storage controller tenant config changes are not additive. Any change overrides all existing tenant configs. This would be fine if both did client side patching, but that's not the case. Once this merges, we must update cplane to use the PATCH endpoint. ## Summary of changes ### High Level Allow for patching of tenant configuration with a `PATCH /v1/tenant/config` endpoint. It takes the same data as its PUT counterpart. For example the payload below will update `gc_period` and unset `compaction_period`. All other fields are left in their original state. ``` { "tenant_id": "1234", "gc_period": "10s", "compaction_period": null } ``` ### Low Level * PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint is only used for cplane managed instances. * `storcon_cli` is updated to have separate commands for `set-tenant-config` and `patch-tenant-config` Related https://github.com/neondatabase/cloud/issues/21043	2024-12-11 19:16:33 +00:00
Anastasia Lubennikova	ef233e91ef	Update compute_installed_extensions metric: (#9891 ) add owned_by_superuser field to filter out system extensions. While on it, also correct related code: - fix the metric setting: use set() instead of inc() in a loop. inc() is not idempotent and can lead to incorrect results if the function is called multiple times. Currently it is only called at compute start, but this will change soon. - fix the return type of the installed_extensions endpoint to match the metric. Currently it is only used in the test.	2024-12-11 16:43:26 +00:00
Mikhail Kot	dee2041cd3	walproposer: fix link error on debian 12 / ubuntu 22 (#10090 ) ## Problem Linking walproposer library (e.g. `cargo t`) produces linker errors: /home/myrrc/neon/pgxn/neon/walproposer_compat.c:169: undefined reference to `pg_snprintf' The library with these symbols (libpgcommon.a) is present ## Summary of changes Changed order of libraries resolution for linker	2024-12-11 16:23:59 +00:00
Arseny Sher	e4bb1ca7d8	Increase neon_local http client to compute timeout in reconfigure. (#10088 ) Seems like 30s sometimes not enough when CI runners are overloaded, causing pull_timeline flakiness. ref https://github.com/neondatabase/neon/issues/9731#issuecomment-2535946443	2024-12-11 15:46:50 +00:00
a-masterov	b987648e71	Enable LFC for all the PG versions. (#10068 ) ## Problem We added support for LFC for tests but are still using it only for the PG17 release. ## Summary of changes LFC is enabled for all PG versions. Errors in tests with LFC enabled now block merging as usual. We keep tests with disabled LFC for PG17 release. Tests on debug builds with LFC enabled still don't affect permission to merge.	2024-12-11 15:28:10 +00:00
Mikhail Kot	c79c1dd8e9	compute_ctl: don't panic if control plane can't be reached (#10078 ) ## Problem If the control plane cannot be reached for some reason, compute_ctl panics ## Summary of changes panic is removed in favour of returning an error. Code is reformatted a bit for more flat control flow Resolves: #5391	2024-12-11 15:03:11 +00:00
Vlad Lazar	a53db73851	pageserver: don't drop multixact slrus on non zero shards (#10086 ) ## Problem We get slru truncation commands on non-zero shards. Compaction will drop the slru dir keys and ingest will fail when receiving such records. https://github.com/neondatabase/neon/pull/10080 fixed it for clog, but not for multixact. ## Summary of changes Only truncate multixact slrus on shard zero. I audited the rest of the ingest code and it looks fine from this pov.	2024-12-11 14:28:18 +00:00
Christian Schwarz	9ae980bf4f	page_service: don't count time spent in Batcher towards smgr latency metrics (#10075 ) ## Problem With pipelining enabled, the time a request spends in the batcher stage counts towards the smgr op latency. If pipelining is disabled, that time is not accounted for. In practice, this results in a jump in smgr getpage latencies in various dashboards and degrades the internal SLO. ## Solution In a similar vein to #10042 and with a similar rationale, this PR stops counting the time spent in batcher stage towards smgr op latency. The smgr op latency metric is reduced to the actual execution time. Time spent in batcher stage is tracked in a separate histogram. I expect to remove that histogram after batching rollout is complete, but it will be helpful in the meantime to reason about the rollout.	2024-12-11 13:37:08 +00:00
Vlad Lazar	665369c439	wal_decoder: fix compact key protobuf encoding (#10074 ) ## Problem Protobuf doesn't support 128 bit integers, so we encode the keys as two 64 bit integers. Issue is that when we split the 128 bit compact key we use signed 64 bit integers to represent the two halves. This may result in a negative lower half when relnode is larger than `0x00800000`. When we convert the lower half to an i128 we get a negative `CompactKey`. ## Summary of Changes Use unsigned integers when encoding into Protobuf. ## Deployment * Prod: We disabled the interpreted proto, so no compat concerns. * Staging: Disable the interpreted proto, do one release, and then release the fixed version. We do this because a negative int32 will convert to a large uint32 value and could give a key in the actual pageserver space. In production we would work around this by adding new fields to the proto and deprecating the old ones, but we can make our lives easy here. * Pre-prod: Same as staging	2024-12-11 12:35:02 +00:00
JC Grünhage	d7aeca2f34	CI(deploy): create git tags/releases before triggering deploy workflows (#10022 ) ## Problem When dev deployments are disabled (or fail), the tags for releases aren't created. It makes more sense to have tag and release creation before the deployment to prevent situations like [this](https://github.com/neondatabase/neon/pull/9959). It is not enough to move the tag creation before the deployment. If the deployment fails, re-running the job isn't possible because the API call to create the tag will fail. ## Summary of changes - Tag/Release creation now happens before the deployment - The two steps for tag and release have been merged into a bigger one - There are new checks to ensure that if the tags/releases already exist as expected, things will continue just fine.	2024-12-11 09:41:34 +00:00
John Spray	38415a9816	pageserver: fix ingest handling of CLog truncate (#10080 ) ## Problem In #9786 we stop storing SLRUs on non-zero shards. However, there was one code path during ingest that still tries to enumerate SLRU relations on all shards. This fails if it sees a tenant who has never seen any write to an SLRU, or who has done such thorough compaction+GC that it has dropped its SLRU directory key. ## Summary of changes - Avoid trying to list SLRU relations on nonzero shards	2024-12-11 09:16:11 +00:00
Matthias van de Meent	597125e124	Disable readstream's reliance on seqscan readahead (#9860 ) Neon doesn't have seqscan detection of its own, so stop read_stream from trying to utilize that readahead, and instead make it issue readahead of its own. ## Problem @knizhnik noticed that we didn't issue smgrprefetch[v] calls for seqscans in PG17 due to the move to the read_stream API, which assumes that the underlying IO facilities do seqscan detection for readahead. That is a wrong assumption when Neon is involved, so let's remove the code that applies that assumption. ## Summary of changes Remove the cases where seqscans are detected and prefetch is disabled as a consequence, and instead don't do that detection. PG PR: https://github.com/neondatabase/postgres/pull/532	2024-12-11 00:51:05 +00:00
Matthias van de Meent	e71d20d392	Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932 ) This fixes neondatabase/neon#9929. ## Postgres repo PRS: - PG17: https://github.com/neondatabase/postgres/pull/538 - PG16: https://github.com/neondatabase/postgres/pull/539 - PG15: https://github.com/neondatabase/postgres/pull/540 - PG14: https://github.com/neondatabase/postgres/pull/541 ## Problem see #9929 ## Summary of changes We update the split code to force the code to emit an FPI whenever the cycle ID might be interesting for concurrent btree vacuum.	2024-12-10 19:42:52 +00:00
Alex Chi Z.	aa0554fd1e	feat(test_runner): allowed_errors in storage scrubber (#10062 ) ## Problem resolve https://github.com/neondatabase/neon/issues/9988#issuecomment-2528239437 ## Summary of changes * New verbose mode for storage scrubber scan metadata (pageserver) that contains the error messages. * Filter allowed_error list from the JSON output to determine the healthy flag status. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-12-10 17:00:47 +00:00
Heikki Linnakangas	b853f78136	Print a log message if GetPage response takes too long (#10046 ) We have metrics for GetPage request latencies, but this is an extra measure to capture requests that take way too long in the logs. The log message is printed every 10 s, until the response is received: ``` PG:2024-12-09 16:02:07.715 GMT [1782845] LOG: [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses) PG:2024-12-09 16:02:17.723 GMT [1782845] LOG: [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses) PG:2024-12-09 16:02:19.719 GMT [1782845] LOG: [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s ```	2024-12-10 16:26:56 +00:00
Alex Chi Z.	6ad99826c1	fix(pageserver): refresh_gc_info should always increase cutoff (#9862 ) ## Problem close https://github.com/neondatabase/cloud/issues/19671 ``` Timeline ----------------------------- ^ last GC happened LSN ^ original retention period setting = 24hr > refresh-gc-info updates the gc_info ^ planned cutoff (gc_info) ^ customer set retention to 48hr, and it's still within the last GC LSN ^1 ^2 we have two choices: (1) update the planned cutoff to move backwards, or (2) keep the current one ``` In this patch, we decided to keep the current cutoff instead of moving back the gc_info to avoid races. In the future, we could allow the planned gc cutoff to go back once cplane sends a retention_history tenant config update, but this requires a careful revisit of the code. ## Summary of changes Ensure that GC cutoffs never go back if retention settings get changed. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-12-10 15:23:26 +00:00
Konstantin Knizhnik	311ee793b9	Fix handling in-flight requests in prefetch buffer resize (#9968 ) ## Problem See https://github.com/neondatabase/neon/issues/9961 Current implementation of prefetch buffer resize doesn't correctly handle in-flight requests ## Summary of changes 1. Fix index of entry we should wait for if new prefetch buffer size is smaller than number of in-flight requests. 2. Correctly set flush position Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-12-10 15:01:40 +00:00