Compare commits

...

65 Commits

Author SHA1 Message Date
Arpad Müller
3c4221db2c fmt 2024-12-13 18:54:20 +01:00
Arpad Müller
00ff70c905 Pass down timeouts to the reqwest client 2024-12-13 18:32:46 +01:00
Peter Bendel
2f3f98a319 use OIDC role instead of AWS access keys for managing test runner (#10117)
in periodic pagebench workflow

## Problem

for background see https://github.com/neondatabase/cloud/issues/21545

## Summary of changes

use an OIDC role to manage runners instead of AWS access keys, which need
to be periodically rotated

## logs

seems to work in
https://github.com/neondatabase/neon/actions/runs/12298575888/job/34322306127#step:6:1
2024-12-12 20:25:39 +00:00
Alex Chi Z.
5ff4b991c7 feat(pageserver): gc-compaction split over LSN (#9900)
## Problem

part of https://github.com/neondatabase/neon/issues/9114, stacked PR
over https://github.com/neondatabase/neon/pull/9897, partially
refactored to help with
https://github.com/neondatabase/neon/issues/10031

## Summary of changes

* gc-compaction takes an `above_lsn` parameter. We only compact the layers
above this LSN, and all data below the LSN are treated as if they are on
the ancestor branch.
* refactored gc-compaction to take a `GcCompactJob` that describes the
rectangular range to be compacted (see the sketch below).
* Added unit test for this case.
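
A rough illustration of such a job description (field names and stand-in
types are assumptions based on the bullets above, not the actual
pageserver definitions):

```
// Hypothetical sketch of a rectangular gc-compaction job. Key and Lsn
// stand in for the real pageserver types.
type Key = u128;
type Lsn = u64;

struct GcCompactJob {
    /// Keyspace slice to compact.
    key_range: std::ops::Range<Key>,
    /// Only layers above `lsn_range.start` are compacted; data below it
    /// is treated as if it lived on the ancestor branch.
    lsn_range: std::ops::Range<Lsn>,
}
```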

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-12-12 20:23:24 +00:00
John Spray
a93e3d31cc storcon: refine logic for choosing AZ on tenant creation (#10054)
## Problem

When we update our scheduler/optimization code to respect AZs properly
(https://github.com/neondatabase/neon/pull/9916), the choice of AZ
becomes a much higher-stakes decision. We will pretty much always run a
tenant in its preferred AZ, and that AZ is fixed for the lifetime of the
tenant (unless a human intervenes).

Eventually, when we do auto-balancing based on utilization, I anticipate
that part of that will be to automatically change the AZ of tenants if
our original scheduling decisions have caused imbalance, but as an
interim measure, we can at least avoid making this scheduling decision
based purely on which AZ contains the emptiest node.

This is a precursor to https://github.com/neondatabase/neon/pull/9947

## Summary of changes

- When creating a tenant, instead of scheduling a shard and then reading
its preferred AZ back, make the AZ decision first.
- Instead of choosing AZ based on which node is emptiest, use the median
utilization of nodes in each AZ to pick the AZ to use. This avoids bad
AZ decisions during periods when some node has very low utilization
(such as after replacing a dead node)
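
A minimal sketch of that median-based selection (assumed shapes, not
storcon's actual types): group nodes by AZ, take each AZ's median
utilization, and pick the AZ with the lowest median, so a single
near-empty node cannot pull every new tenant into its AZ.

```
use std::collections::HashMap;

// nodes: (az_name, utilization) pairs; returns the AZ with the lowest
// median utilization, or None if there are no nodes.
fn pick_az(nodes: &[(String, u64)]) -> Option<String> {
    let mut by_az: HashMap<&str, Vec<u64>> = HashMap::new();
    for (az, utilization) in nodes {
        by_az.entry(az).or_default().push(*utilization);
    }
    by_az
        .into_iter()
        .map(|(az, mut us)| {
            us.sort_unstable();
            (az.to_string(), us[us.len() / 2]) // median (upper of the two for even n)
        })
        .min_by_key(|&(_, median)| median)
        .map(|(az, _)| az)
}
```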

I considered also making the selection a weighted pseudo-random choice
based on utilization, but wanted to avoid destabilising tests with that
for now.
2024-12-12 19:35:38 +00:00
Rahul Patil
6d5687521b fix(ci): Allow github-script to post test reports (#10120)
Allow github-script to post test reports
2024-12-12 18:53:35 +00:00
Heikki Linnakangas
53721266f1 Disable connection logging in pgbouncer by default (#10118)
It can produce a lot of logs, making pgbouncer itself consume all CPU in
extreme cases. We saw that happen in stress testing.
2024-12-12 17:05:58 +00:00
a-masterov
2f3433876f Change the channel for notification. (#10112)
## Problem
Notifications about failures in `pg_regress` tests run on the staging
cloud instance currently reach the channel `on-call-staging-stream`,
while they should reach `on-call-qa-staging-stream`.
## Summary of changes
The channel has been changed.
2024-12-12 16:34:07 +00:00
Rahul Patil
58d45c6e86 ci(fix): Use OIDC auth to login on ECR (#10055)
## Problem

CI currently uses static credentials in some places. These are less
secure and hard to maintain, so we are going to deprecate them and use
OIDC auth.

## Summary of changes
- ci(fix): Use OIDC auth to upload artifact on s3
- ci(fix): Use OIDC auth to login on ECR
2024-12-12 15:13:08 +00:00
Conrad Ludgate
e502e880b5 chore(proxy): remove code for old API (#10109)
## Problem

Now that https://github.com/neondatabase/cloud/issues/15245 is done, we
can remove the old code.

## Summary of changes

Removes support for the ManagementV2 API, in favour of the ProxyV1 API.
2024-12-12 13:42:50 +00:00
Arseny Sher
c9a773af37 Fix test_subscriber_synchronous_commit flakiness. (#10057)
6f7aeaa configured LFC for the USE_LFC case, but omitted setting
shared_buffers for the non-USE_LFC case, causing flakiness.

ref https://github.com/neondatabase/neon/issues/9989
2024-12-12 11:57:00 +00:00
Vlad Lazar
ec0ce06c16 tests: default interpreted proto in tests (#10079)
## Problem

We aren't using the sharded interpreted wal receiver protocol in all
tests.

## Summary of changes

Default to the interpreted protocol.
2024-12-12 10:53:10 +00:00
Alexander Bayandin
0bd8eca9ca Storage: create release PRs On Fridays (#10017)
## Problem

To give Storage more time on preprod — create a release branch on Friday

## Summary of changes
- Automatically create Storage release PR on Friday instead of Monday
2024-12-12 09:18:50 +00:00
Misha Sakhnov
739f627b96 Bump vm-builder v0.35.0 -> v0.37.1 (#10015)
Bump version to pick up changes introduced in the neonvm-daemon to
support sys fs based CPU scaling
(https://github.com/neondatabase/autoscaling/issues/1082).

Previous update: https://github.com/neondatabase/neon/pull/9208
2024-12-12 08:45:52 +00:00
Arpad Müller
342cbea255 storcon: add safekeeper list API (#10089)
This adds an API to the storage controller to list safekeepers
registered to it.

This PR does a `diesel print-schema > storage_controller/src/schema.rs`
because of an inconsistency between up.sql and schema.rs, introduced by
[this](2c142f14f7)
commit, so there are some updates to `schema.rs` due to that. As a
follow-up to this, we should maybe think about running `diesel
print-schema` in CI.

Part of #9981
2024-12-12 01:09:24 +00:00
Tristan Partin
b391b29bdc Improve typing in test_runner/fixtures/httpserver.py (#10103)
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-12-11 22:21:42 +00:00
Erik Grinaker
5126ebbfed test_runner: bump test_check_visibility_map timeout (#10091)
## Problem

`test_check_visibility_map` has been seen to time out in debug tests.

## Summary of changes

Bump the timeout to 10 minutes (test reports indicate 7 minutes is
sufficient).

We don't want to disable the test entirely in debug builds, to exercise
this with debug assertions enabled.

Resolves #10069.
2024-12-11 21:37:25 +00:00
Arpad Müller
7fa986bc92 Do tenant manifest validation with index-part (#10007)
This adds some validation of invariants that we want to uphold wrt the
tenant manifest and `index_part.json`:

* the data the manifest has about a timeline must match the data in
`index_part.json`. It might actually change, e.g. when we do reparenting
during detach ancestor, but that requires the timeline to be
unoffloaded, i.e. removed from the manifest.
* any timeline mentioned in index part must, if present, be archived.
If we unarchive, we first update the tenant manifest to unoffload, and
only then update index part. And one needs to archive before offloading.
* it is legal for timelines to be mentioned in the manifest but have no
`index_part`: this is a temporary state visible during deletion of the
timeline. If the pageserver crashed, an attach of the tenant will clean
the state up.
* it is also legal for offloaded timelines to have an
`ancestor_retain_lsn` of None while having an `ancestor_timeline_id`.
This is for the to-be-added flattening functionality: the plan is to set
the former to None if we have flattened a timeline.
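
To make the second invariant concrete, here is a sketch with
hypothetical types (the real ones live in the pageserver code):

```
use std::collections::HashMap;

// Stand-in types for the sketch.
type TimelineId = u64;
struct ManifestEntry; // offloaded-timeline metadata from the tenant manifest
struct IndexPart {
    is_archived: bool,
}

// A timeline in the manifest may be absent from index_part (deletion in
// progress), but if present it must be archived: unarchiving removes it
// from the manifest before touching index_part.
fn validate(
    manifest: &HashMap<TimelineId, ManifestEntry>,
    index_parts: &HashMap<TimelineId, IndexPart>,
) -> Result<(), String> {
    for id in manifest.keys() {
        match index_parts.get(id) {
            None => {} // legal: temporary state during timeline deletion
            Some(part) if part.is_archived => {}
            Some(_) => return Err(format!("timeline {id} is offloaded but not archived")),
        }
    }
    Ok(())
}
```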

follow-up of #9942
part of #8088
2024-12-11 20:10:22 +00:00
Vlad Lazar
e8395807a5 storcon: allow for more concurrency in drain/fill operations (#10093)
## Problem

We saw the drain/fill operations not drain fast enough in ap-southeast.

## Summary of changes

These are some quick changes to speed it up:
* double reconcile concurrency - this is now half of the available
reconcile bandwidth
* reduce the waiter polling timeout - this way we can spawn new
reconciliations faster
2024-12-11 19:43:40 +00:00
Vlad Lazar
a3e80448e8 pageserver/storcon: add patch endpoints for tenant config metrics (#10020)
## Problem

Cplane and storage controller tenant config changes are not additive.
Any change overrides all existing tenant configs. This would be fine if
both did client side patching, but that's not the case.

Once this merges, we must update cplane to use the PATCH endpoint.

## Summary of changes

### High Level

Allow for patching of tenant configuration with a `PATCH
/v1/tenant/config` endpoint.
It takes the same data as its PUT counterpart. For example, the payload
below will update `gc_period` and unset `compaction_period`. All other
fields are left in their original state.
```
{
  "tenant_id": "1234",
  "gc_period": "10s",
  "compaction_period": null
}
```
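
A sketch of those merge semantics using `serde_json` (illustrative only,
not the storcon implementation): present fields override, an explicit
`null` unsets, and absent fields are left untouched.

```
use serde_json::{Map, Value};

fn apply_patch(config: &mut Map<String, Value>, patch: Map<String, Value>) {
    for (key, value) in patch {
        if value.is_null() {
            config.remove(&key); // e.g. "compaction_period": null -> unset
        } else {
            config.insert(key, value); // e.g. "gc_period": "10s" -> override
        }
    }
}
```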

### Low Level
* PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint
is only used for cplane managed instances.
* `storcon_cli` is updated to have separate commands for
`set-tenant-config` and `patch-tenant-config`

Related https://github.com/neondatabase/cloud/issues/21043
2024-12-11 19:16:33 +00:00
Anastasia Lubennikova
ef233e91ef Update compute_installed_extensions metric: (#9891)
add owned_by_superuser field to filter out system extensions.

While on it, also correct related code:
- fix the metric setting: use set() instead of inc() in a loop (see the
sketch below). inc() is not idempotent and can lead to incorrect results
if the function is called multiple times. Currently it is only called at
compute start, but this will change soon.
- fix the return type of the installed_extensions endpoint
to match the metric. Currently it is only used in the test.
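
The set()-vs-inc() distinction matters because the collection may run
more than once; a sketch (the metric type is an assumption):

```
use prometheus::IntGaugeVec;

// set() is idempotent: re-running the collection converges to the true
// value. inc() in a loop accumulates, so a second pass would double
// every per-extension count.
fn publish(gauge: &IntGaugeVec, installed: &[(String, i64)]) {
    for (ext, count) in installed {
        gauge.with_label_values(&[ext]).set(*count);
    }
}
```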
2024-12-11 16:43:26 +00:00
Mikhail Kot
dee2041cd3 walproposer: fix link error on debian 12 / ubuntu 22 (#10090)
## Problem

Linking the walproposer library (e.g. `cargo t`) produces linker errors:
/home/myrrc/neon/pgxn/neon/walproposer_compat.c:169: undefined reference
to `pg_snprintf'

The library with these symbols (libpgcommon.a) is present

## Summary of changes

Changed the library resolution order for the linker
2024-12-11 16:23:59 +00:00
Arseny Sher
e4bb1ca7d8 Increase neon_local http client to compute timeout in reconfigure. (#10088)
Seems like 30s is sometimes not enough when CI runners are overloaded,
causing pull_timeline flakiness.

ref
https://github.com/neondatabase/neon/issues/9731#issuecomment-2535946443
2024-12-11 15:46:50 +00:00
a-masterov
b987648e71 Enable LFC for all the PG versions. (#10068)
## Problem
We added support for LFC for tests but are still using it only for the
PG17 release.

## Summary of changes
LFC is enabled for all PG versions. Errors in tests with LFC enabled now
block merging as usual. We keep tests with disabled LFC for PG17
release. Tests on debug builds with LFC enabled still don't affect
permission to merge.
2024-12-11 15:28:10 +00:00
Mikhail Kot
c79c1dd8e9 compute_ctl: don't panic if control plane can't be reached (#10078)
## Problem

If the control plane cannot be reached for some reason, compute_ctl
panics

## Summary of changes

The panic is removed in favour of returning an error.
The code is reformatted a bit for flatter control flow.

Resolves: #5391
2024-12-11 15:03:11 +00:00
Vlad Lazar
a53db73851 pageserver: don't drop multixact slrus on non zero shards (#10086)
## Problem

We get slru truncation commands on non-zero shards.
Compaction will drop the slru dir keys and ingest will fail when
receiving such records.
https://github.com/neondatabase/neon/pull/10080 fixed it for clog, but
not for multixact.

## Summary of changes

Only truncate multixact slrus on shard zero. I audited the rest of the
ingest code and it looks
fine from this pov.
2024-12-11 14:28:18 +00:00
Christian Schwarz
9ae980bf4f page_service: don't count time spent in Batcher towards smgr latency metrics (#10075)
## Problem

With pipelining enabled, the time a request spends in the batcher stage
counts towards the smgr op latency.

If pipelining is disabled, that time is not accounted for.

In practice, this results in a jump in smgr getpage latencies in various
dashboards and degrades the internal SLO.

## Solution

In a similar vein to #10042 and with a similar rationale, this PR stops
counting the time spent in batcher stage towards smgr op latency.

The smgr op latency metric is reduced to the actual execution time.

Time spent in batcher stage is tracked in a separate histogram.
I expect to remove that histogram after batching rollout is complete,
but it will be helpful in the meantime to reason about the rollout.
2024-12-11 13:37:08 +00:00
Vlad Lazar
665369c439 wal_decoder: fix compact key protobuf encoding (#10074)
## Problem

Protobuf doesn't support 128 bit integers, so we encode the keys as two
64 bit integers. The issue is that when we split the 128 bit compact key we
use signed 64 bit integers to represent the two halves. This may result
in a negative lower half when relnode is larger than `0x00800000`. When
we convert the lower half to an i128 we get a negative `CompactKey`.

## Summary of Changes

Use unsigned integers when encoding into Protobuf.
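
A small sketch of the failure mode (illustrative, not the actual
`wal_decoder` code), showing why the signed split corrupts keys whose
low half has the top bit set:

```
// Splitting with signed halves sign-extends on reassembly; the unsigned
// split round-trips losslessly.
fn roundtrip_signed(key: u128) -> u128 {
    let hi = (key >> 64) as i64;
    let lo = key as i64; // negative once bit 63 of the low half is set
    (((hi as i128) << 64) | (lo as i128)) as u128 // sign extension clobbers hi
}

fn roundtrip_unsigned(key: u128) -> u128 {
    let hi = (key >> 64) as u64;
    let lo = key as u64;
    ((hi as u128) << 64) | (lo as u128)
}

fn main() {
    let key: u128 = (1 << 64) | (1 << 63); // low half has its top bit set
    assert_ne!(roundtrip_signed(key), key); // corrupted
    assert_eq!(roundtrip_unsigned(key), key); // lossless
}
```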

## Deployment

* Prod: We disabled the interpreted proto, so no compat concerns.
* Staging: Disable the interpreted proto, do one release, and then
release the fixed version.
We do this because a negative int32 will convert to a large uint32 value
and could give
a key in the actual pageserver key space. In production we would work
around this by adding new
fields to the proto and deprecating the old ones, but we can make our
lives easy here.
* Pre-prod: Same as staging
2024-12-11 12:35:02 +00:00
JC Grünhage
d7aeca2f34 CI(deploy): create git tags/releases before triggering deploy workflows (#10022)
## Problem

When dev deployments are disabled (or fail), the tags for releases
aren't created. It makes more sense to have tag and release creation
before the deployment to prevent situations like
[this](https://github.com/neondatabase/neon/pull/9959).

It is not enough to move the tag creation before the deployment. If the
deployment fails, re-running the job isn't possible because the API call
to create the tag will fail.

## Summary of changes

- Tag/Release creation now happens before the deployment
- The two steps for tag and release have been merged into a bigger one
- There are new checks to ensure that if the tags/releases already
exist as expected, things will continue just fine.
2024-12-11 09:41:34 +00:00
John Spray
38415a9816 pageserver: fix ingest handling of CLog truncate (#10080)
## Problem

In #9786 we stop storing SLRUs on non-zero shards.

However, there was one code path during ingest that still tries to
enumerate SLRU relations on all shards. This fails if it sees a tenant
who has never seen any write to an SLRU, or who has done such thorough
compaction+GC that it has dropped its SLRU directory key.

## Summary of changes

- Avoid trying to list SLRU relations on nonzero shards
2024-12-11 09:16:11 +00:00
Matthias van de Meent
597125e124 Disable readstream's reliance on seqscan readahead (#9860)
Neon doesn't have seqscan detection of its own, so stop read_stream from
trying to utilize that readahead, and instead make it issue readahead of
its own.

## Problem

@knizhnik noticed that we didn't issue smgrprefetch[v] calls for
seqscans in PG17 due to the move to the read_stream API, which assumes
that the underlying IO facilities do seqscan detection for readahead.
That is a wrong assumption when Neon is involved, so let's remove the
code that applies that assumption.

## Summary of changes
Remove the cases where seqscans are detected and prefetch is disabled as
a consequence, and instead don't do that detection.

PG PR: https://github.com/neondatabase/postgres/pull/532
2024-12-11 00:51:05 +00:00
Matthias van de Meent
e71d20d392 Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932)
This fixes neondatabase/neon#9929.

## Postgres repo PRS:
- PG17: https://github.com/neondatabase/postgres/pull/538
- PG16: https://github.com/neondatabase/postgres/pull/539
- PG15: https://github.com/neondatabase/postgres/pull/540
- PG14: https://github.com/neondatabase/postgres/pull/541

## Problem
see #9929 

## Summary of changes

We update the split code to force emitting an FPI whenever the cycle ID
might be interesting for concurrent btree vacuum.
2024-12-10 19:42:52 +00:00
Alex Chi Z.
aa0554fd1e feat(test_runner): allowed_errors in storage scrubber (#10062)
## Problem

resolve
https://github.com/neondatabase/neon/issues/9988#issuecomment-2528239437

## Summary of changes

* New verbose mode for storage scrubber scan metadata (pageserver) that
contains the error messages.
* Filter allowed_error list from the JSON output to determine the
healthy flag status.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-10 17:00:47 +00:00
Heikki Linnakangas
b853f78136 Print a log message if GetPage response takes too long (#10046)
We have metrics for GetPage request latencies, but this is an extra
measure to capture requests that take way too long in the logs. The log
message is printed every 10 s, until the response is received:

```
PG:2024-12-09 16:02:07.715 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:17.723 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:19.719 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s
```
2024-12-10 16:26:56 +00:00
Alex Chi Z.
6ad99826c1 fix(pageserver): refresh_gc_info should always increase cutoff (#9862)
## Problem

close https://github.com/neondatabase/cloud/issues/19671

```
Timeline -----------------------------
         ^ last GC happened LSN
              ^ original retention period setting = 24hr
> refresh-gc-info updates the gc_info
              ^ planned cutoff (gc_info)
         ^ customer set retention to 48hr, and it's still within the last GC LSN
         ^1   ^2 we have two choices: (1) update the planned cutoff to
                 move backwards, or (2) keep the current one
```

In this patch, we decided to keep the current cutoff instead of moving
back the gc_info to avoid races. In the future, we could allow the
planned gc cutoff to go back once cplane sends a retention_history
tenant config update, but this requires a careful revisit of the code.

## Summary of changes

Ensure that GC cutoffs never go back if retention settings get changed.
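
In effect the planned cutoff becomes monotonic. A sketch of the rule
(names are illustrative):

```
type Lsn = u64; // stand-in for the pageserver Lsn type

// Even if a newly enlarged retention period computes a cutoff below the
// previous one, the planned cutoff never moves backwards.
fn next_planned_cutoff(current: Lsn, computed_from_retention: Lsn) -> Lsn {
    std::cmp::max(current, computed_from_retention)
}
```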

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-10 15:23:26 +00:00
Konstantin Knizhnik
311ee793b9 Fix handling of in-flight requests in prefetch buffer resize (#9968)
## Problem

See https://github.com/neondatabase/neon/issues/9961
The current implementation of prefetch buffer resize doesn't correctly
handle in-flight requests

## Summary of changes

1. Fix the index of the entry we should wait for if the new prefetch
buffer size is smaller than the number of in-flight requests.
2. Correctly set the flush position

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-12-10 15:01:40 +00:00
Erik Grinaker
ad472bd4a1 test_runner: add visibility map test (#9940)
Verifies that visibility map pages are correctly maintained across
shards.

Touches #9914.
2024-12-10 12:07:00 +00:00
Arpad Müller
c51db1db61 Replace MAX_KEYS_PER_DELETE constant with function (#10061)
Azure has a per-request limit of 256 items for bulk deletion, compared
to 1000 on AWS. Therefore, we need to support multiple values. Due to
`GenericRemoteStorage`, we can't add an
associated constant, but it has to be a function.

The PR replaces the `MAX_KEYS_PER_DELETE` constant with a function of
the same name, implemented on both the `RemoteStorage` trait as well as
on `GenericRemoteStorage`.

The value serves as a hint of how many objects to pass to the
`delete_objects` function.
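
A sketch of the shape of that change (simplified; the Rust method name
mirrors the old constant, and the limits are the ones quoted above):

```
trait RemoteStorage {
    /// Hint for how many objects to pass to `delete_objects` per request.
    fn max_keys_per_delete(&self) -> usize;
}

struct S3Bucket;
struct AzureContainer;

impl RemoteStorage for S3Bucket {
    fn max_keys_per_delete(&self) -> usize {
        1000 // AWS DeleteObjects per-request limit
    }
}

impl RemoteStorage for AzureContainer {
    fn max_keys_per_delete(&self) -> usize {
        256 // Azure blob batch per-request limit
    }
}
```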

Reading:

* https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch
* https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html

Part of #7931
2024-12-10 11:29:38 +00:00
Ivan Efremov
34c1295594 [proxy] impr: Additional logging for cancellation queries (#10039)
## Problem
Cancellation tasks spawned in the background sometimes log with
missing context.

https://neondb.slack.com/archives/C060N3SEF9D/p1733427801527419?thread_ts=1733419882.560159&cid=C060N3SEF9D

## Summary of changes
Add `session_id` and change the log level for cancellation queries
2024-12-10 10:14:28 +00:00
Evan Fleming
b593e51eae safekeeper: use arc for global timelines and config (#10051)
Hello! I was interested in potentially making some contributions to Neon
and looking through the issue backlog I found
[8200](https://github.com/neondatabase/neon/issues/8200) which seemed
like a good first issue to attempt to tackle. I see it was assigned a
while ago so apologies if I'm stepping on any toes with this PR. I also
apologize for the size of this PR. I'm not sure if there is a simple way
to reduce it given the footprint of the components being changed.

## Problem
This PR is attempting to address part of the problem outlined in issue
[8200](https://github.com/neondatabase/neon/issues/8200). Namely to
remove global static usage of timeline state in favour of
`Arc<GlobalTimelines>` and to replace wasteful clones of
`SafeKeeperConf` with `Arc<SafeKeeperConf>`. I did not opt to tackle
`RemoteStorage` in this PR to minimize the amount of changes as this PR
is already quite large. I also did not opt to introduce an
`SafekeeperApp` wrapper struct to similarly minimize changes but I can
tackle either or both of these omissions in this PR if folks would like.

## Summary of changes
- Remove static usage of `GlobalTimelines` in favour of
`Arc<GlobalTimelines>`
- Wrap `SafeKeeperConf` in `Arc` to avoid wasteful clones of the
underlying struct
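
A sketch of the resulting shape (hypothetical field layout): the
registry is built once at startup, and each task holds a cheap `Arc`
handle instead of reaching for a global static.

```
use std::sync::Arc;

struct SafeKeeperConf; // stand-in for the real config struct
struct GlobalTimelines {
    conf: Arc<SafeKeeperConf>,
    // ... timeline map, etc.
}

fn start() {
    let conf = Arc::new(SafeKeeperConf);
    let timelines = Arc::new(GlobalTimelines { conf: Arc::clone(&conf) });

    // Each service clones the handle: a refcount bump, not a deep copy
    // of the config or the timeline map.
    let for_http = Arc::clone(&timelines);
    let for_wal_service = timelines;
    let _ = (for_http, for_wal_service, conf);
}
```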

## Some additional thoughts
- We seem to currently store `SafeKeeperConf` in `GlobalTimelines` and
then expose it through a public `get_global_config` function, which
requires locking. This seems needlessly wasteful and based on observed
usage we could remove this public accessor and force consumers to
acquire `SafeKeeperConf` through the new Arc reference.
2024-12-09 21:09:20 +00:00
Alex Chi Z.
4c4cb80186 fix(pageserver): fix gc-compaction racing with legacy gc (#10052)
## Problem

close https://github.com/neondatabase/neon/issues/10049, close
https://github.com/neondatabase/neon/issues/10030, close
https://github.com/neondatabase/neon/issues/8861

part of https://github.com/neondatabase/neon/issues/9114

The legacy gc process calls `get_latest_gc_cutoff`, which uses an Rcu
different from the gc_info struct. In the gc_compaction_smoke test case,
the "latest" cutoff could be lower than the gc_info struct, causing
gc-compaction to collect data that could be accessed by
`latest_gc_cutoff`. Technically speaking, there's nothing wrong with
gc-compaction using gc_info without considering latest_gc_cutoff,
because gc_info is the source of truth. But anyways, let's fix it.

## Summary of changes

* gc-compaction uses `latest_gc_cutoff` instead of gc_info to determine
the gc horizon.
* if a gc-compaction is scheduled via tenant compaction iteration, it
will take the gc_block lock to avoid racing with functionalities like
detach ancestor (if it's triggered via manual compaction API without
scheduling, then it won't take the lock)

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-12-09 20:06:06 +00:00
a-masterov
92273b6d5e Enable the pg_regress tests on staging for PG17 (#9978)
## Problem
Currently, we run the `pg_regress` tests only for PG16
However, PG17 is a part of Neon and should be tested as well 
## Summary of changes
Modified the workflow and added a patch for PG17 enabling the
`pg_regress` tests.
The problem with leftovers was solved by using branches.
2024-12-09 19:30:39 +00:00
Arpad Müller
e74e7aac93 Use updated patched azure SDK crates (#10036)
For a while already, we've been unable to update the Azure SDK crates
due to Azure adopting use of a non-tokio async runtime, see #7545.

The effort to upstream the fix got stalled, and I think it's better to
switch to a patched version of the SDK that is up to date.

Now we have a fork of the SDK under the neondatabase github org, to
which I have applied Conrad's rebased patches to:
https://github.com/neondatabase/azure-sdk-for-rust/tree/neon .

The existence of a fork will also help with shipping bulk delete support
before it's upstreamed (#7931).

Also, in related news, the Azure SDK has gotten a rift in development,
where the main branch pertains to a future, to-be-officially-blessed
release of the SDK, and the older versions, which we are currently
using, are on the `legacy` branch. Upstream doesn't really want patches
for the `legacy` branch any more; they want to focus on the `main`
efforts. However, even then, the `legacy` branch is still newer than
what we have right now, so let's switch to `legacy` for now.

Depending on how long it takes, we can switch to the official version of
the SDK once it's released or switch to the upstream `main` branch if
there are changes we want before that.

As a nice side effect of this PR, we now use reqwest 0.12 everywhere,
dropping the dependency on version 0.11.

Fixes #7545
2024-12-09 15:50:06 +00:00
Vlad Lazar
4cca5cdb12 deps: update url to 2.5.4 for RUSTSEC-2024-0421 (#10059)
## Problem

See https://rustsec.org/advisories/RUSTSEC-2024-0421

## Summary of changes

Update url crate to 2.5.4.
2024-12-09 14:57:42 +00:00
Arpad Müller
9d425b54f7 Update AWS SDK crates (#10056)
Result of running:

cargo update -p aws-types -p aws-sigv4 -p aws-credential-types -p
aws-smithy-types -p aws-smithy-async -p aws-sdk-kms -p aws-sdk-iam -p
aws-sdk-s3 -p aws-config

We want to keep the AWS SDK up to date as that way we benefit from new
developments and improvements.
2024-12-09 12:46:59 +00:00
John Spray
ec790870d5 storcon: automatically clear Pause/Stop scheduling policies to enable detaches (#10011)
## Problem

We saw a tenant get stuck when it had been put into Pause scheduling
mode to pin it to a pageserver, then it was left idle for a while and
the control plane tried to detach it.

Close: https://github.com/neondatabase/neon/issues/9957

## Summary of changes

- When changing policy to Detached or Secondary, set the scheduling
policy to Active.
- Add a test that exercises this
- When persisting tenant shards, set their `generation_pageserver` to
null if the placement policy is not Attached (this enables consistency
checks to work, and avoids leaving state in the DB that could be
confusing/misleading in future)
2024-12-07 13:05:09 +00:00
Christian Schwarz
4d7111f240 page_service: don't count time spent flushing towards smgr latency metrics (#10042)
## Problem

In #9962 I changed the smgr metrics to include time spent on flush.

It isn't under our (=storage team's) control how long that flush takes
because the client can stop reading requests.

## Summary of changes

Stop the timer as soon as we've buffered up the response in the
`pgb_writer`.

Track flush time in a separate metric.
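
A sketch of the timing split (stand-in types and names; the real code
lives in pageserver's page_service): the smgr op timer stops once the
response is buffered, while the flush, whose duration the client
controls by how fast it reads, gets its own histogram.

```
use prometheus::Histogram;
use std::time::Instant;

struct PgbWriter;
impl PgbWriter {
    fn buffer(&mut self, _resp: Vec<u8>) {}
    fn flush(&mut self) {} // can stall for as long as the client stops reading
}

fn respond(
    op_started: Instant, // taken when request execution began
    op_latency: &Histogram,
    flush_latency: &Histogram,
    pgb: &mut PgbWriter,
    resp: Vec<u8>,
) {
    pgb.buffer(resp);
    // Stop the smgr op timer here: execution + buffering, not the flush.
    op_latency.observe(op_started.elapsed().as_secs_f64());

    let flush_started = Instant::now();
    pgb.flush();
    flush_latency.observe(flush_started.elapsed().as_secs_f64());
}
```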

---------

Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
2024-12-07 08:57:55 +00:00
Alex Chi Z.
b1fd086c0c test(pageserver): disable gc_compaction smoke test for now (#10045)
## Problem

The test is flaky.

## Summary of changes

Disable the test.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-06 22:30:04 +00:00
Heikki Linnakangas
b6eea65597 Fix error message if PS connection is lost while receiving prefetch (#9923)
If the pageserver connection is lost while receiving the prefetch
request, the prefetch queue is cleared. The error message prints the
values from the prefetch slot, but because the slot was already cleared,
they're all zeros:

LOG: [NEON_SMGR] [shard 0] No response from reading prefetch entry 0:
0/0/0.0 block 0. This can be caused by a concurrent disconnect

To fix, make local copies of the values.

In passing, also add a sanity check that if the receive() call
succeeds, the prefetch slot is still intact.
2024-12-06 20:56:57 +00:00
Alex Chi Z.
c42c28b339 feat(pageserver): gc-compaction split job and partial scheduler (#9897)
## Problem

part of https://github.com/neondatabase/neon/issues/9114, stacked PR
over #9809

The compaction scheduler now schedules partial compaction jobs.

## Summary of changes

* Add the compaction job splitter based on size.
* Schedule subcompactions using the compaction scheduler.
* Test subcompaction scheduler in the smoke regress test.
* Temporarily disable layer map checks

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-12-06 18:44:26 +00:00
Tristan Partin
e4837b0a5a Bump sql_exporter to 0.16.0 (#10041)
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-12-06 17:43:55 +00:00
Erik Grinaker
14c4fae64a test_runner/performance: add improved bulk insert benchmark (#9812)
Adds an improved bulk insert benchmark, including S3 uploads.

Touches #9789.
2024-12-06 15:17:15 +00:00
Vlad Lazar
cc70fc802d pageserver: add metric for number of wal records received by each shard (#10035)
## Problem

With the current metrics we can't identify which shards are ingesting
data at any given time.

## Summary of changes

Add a metric for the number of wal records received for processing by
each shard. This is per (tenant, timeline, shard).
2024-12-06 12:51:41 +00:00
Alexey Kondratov
fa07097f2f chore: Reorganize and refresh CODEOWNERS (#10008)
## Problem

We didn't have a codeowner for `/compute`, so nobody was auto-assigned
for PRs like #9973

## Summary of changes

While on it:
1. Group codeowners into sections.
2. Remove control plane from the `/compute_tools` because it's primarily
the internal `compute_ctl` code.
3. Add control plane (and compute) to `/libs/compute_api` because that's
the shared public interface of the compute.
2024-12-06 11:44:50 +00:00
Erik Grinaker
7838659197 pageserver: assert that keys belong to shard (#9943)
We've seen cases where stray keys end up on the wrong shard. This
shouldn't happen. Add debug assertions to prevent this. In release
builds, we should be lenient in order to handle changing key ownership
policies.
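
A sketch of that lenient-in-release pattern (the ownership rule below is
a placeholder, not the real key-to-shard mapping):

```
#[derive(Debug)]
struct ShardIdentity {
    number: u32,
    count: u32,
}

#[derive(Debug)]
struct Key(u128);

impl ShardIdentity {
    // Placeholder ownership rule for the sketch.
    fn is_key_local(&self, key: &Key) -> bool {
        (key.0 % self.count as u128) as u32 == self.number
    }
}

// debug_assert! fires only in debug builds: tests catch stray keys hard,
// while release builds stay lenient in case ownership policies change.
fn on_key_write(shard: &ShardIdentity, key: &Key) {
    debug_assert!(
        shard.is_key_local(key),
        "key {key:?} does not belong to shard {shard:?}"
    );
}
```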

Touches #9914.
2024-12-06 10:24:13 +00:00
Vlad Lazar
3f1c542957 pageserver: add disk consistent and remote lsn metrics (#10005)
## Problem

There's no metrics for disk consistent LSN and remote LSN. This stuff is
useful when looking at ingest performance.

## Summary of changes

Two per timeline metrics are added: `pageserver_disk_consistent_lsn` and
`pageserver_projected_remote_consistent_lsn`. I went for the projected
remote lsn instead of the visible one
because that more closely matches remote storage write throughput. Ideally we
would have both, but these metrics are expensive.
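
A sketch of publishing such a gauge (the label set is an assumption;
LSNs are cast to `i64` because Prometheus integer gauges are signed):

```
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_disk_consistent_lsn",
        "LSN up to which the timeline is durable on local disk",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .unwrap()
});

fn publish_lsn(tenant: &str, shard: &str, timeline: &str, lsn: u64) {
    DISK_CONSISTENT_LSN
        .with_label_values(&[tenant, shard, timeline])
        .set(lsn as i64);
}
```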
2024-12-06 10:21:52 +00:00
Erik Grinaker
ec4072f845 pageserver: add wait_until_flushed parameter for timeline checkpoint (#10013)
## Problem

I'm writing an ingest benchmark in #9812. To time S3 uploads, I need to
schedule a flush of the Pageserver's in-memory layer, but don't actually
want to wait around for it to complete (which will take a minute).

## Summary of changes

Add a parameter `wait_until_flushed` (default `true`) for
`timeline/checkpoint` to control whether to wait for the flush to
complete.
2024-12-06 10:12:39 +00:00
Erik Grinaker
56f867bde5 pageserver: only zero truncated FSM page on owning shard (#10032)
## Problem

FSM pages are managed like regular relation pages, and owned by a single
shard. However, when truncating the FSM relation, the last FSM page was
zeroed out on all shards. This is unnecessary and potentially confusing.

The superfluous keys will be removed during compactions, as they do not
belong on these shards.

Resolves #10027.

## Summary of changes

Only zero out the truncated FSM page on the owning shard.
2024-12-06 07:22:22 +00:00
Arpad Müller
d1ab7471e2 Fix desc_str for Azure container (#10021)
Small logs fix I've noticed while working on
https://github.com/neondatabase/cloud/issues/19963 .
2024-12-05 20:51:57 +00:00
Tristan Partin
6ff4175fd7 Send Content-Type header on reconfigure request from neon_local (#10029)
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-12-05 20:30:35 +00:00
Tristan Partin
6331cb2161 Bump anyhow to 1.0.94 (#10028)
We were over a year out of date.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-12-05 19:42:52 +00:00
Alex Chi Z.
71f38d1354 feat(pageserver): support schedule gc-compaction (#9809)
## Problem

part of https://github.com/neondatabase/neon/issues/9114

gc-compaction can take a long time. This patch adds support for
scheduling a gc-compaction job. The compaction loop will first handle
L0->L1 compaction, and then gc compaction. The scheduled jobs are stored
in a non-persistent queue within the tenant structure.

This will be the building block for the partial compaction trigger -- if
the system determines that we need to do a gc compaction, it will
partition the keyspace and schedule several jobs. Each of these jobs
will run for a short amount of time (e.g., 1 min). L0 compaction will be
prioritized over gc compaction.

## Summary of changes
 
* Add compaction scheduler in tenant.
* Run scheduled compaction in integration tests.
* Change the manual compaction API to allow scheduling a compaction
instead of immediately doing it.
* Add LSN upper bound as gc-compaction parameter. If we schedule partial
compactions, gc_cutoff might move across different runs. Therefore, we
need to pass a pre-determined gc_cutoff beforehand. (TODO: support LSN
lower bound so that we can compact arbitrary "rectangle" in the layer
map)
* Refactor the gc_compaction internal interface.
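
A sketch of the non-persistent queue (field and type names assumed):
jobs live only in memory on the tenant, and the compaction loop drains
L0 work before popping the next scheduled job.

```
use std::collections::VecDeque;
use std::sync::Mutex;

struct ScheduledCompaction; // key range, LSN upper bound, ...

struct Tenant {
    scheduled_compactions: Mutex<VecDeque<ScheduledCompaction>>,
}

impl Tenant {
    fn schedule_compaction(&self, job: ScheduledCompaction) {
        self.scheduled_compactions.lock().unwrap().push_back(job);
    }

    // Called by the compaction loop after L0->L1 work is done; the queue
    // does not survive a pageserver restart.
    fn next_scheduled_compaction(&self) -> Option<ScheduledCompaction> {
        self.scheduled_compactions.lock().unwrap().pop_front()
    }
}
```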

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-12-05 19:37:17 +00:00
Tristan Partin
c0ba416967 Add compute_logical_snapshots_bytes metric (#9887)
This metric exposes the size of all non-temporary logical snapshot
files.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-12-05 19:04:33 +00:00
Alexey Kondratov
13e8105740 feat(compute): Allow specifying the reconfiguration concurrency (#10006)
## Problem

We need a higher concurrency during reconfiguration in case of many DBs,
but the instance is already running and used by the client. We can
easily run into the `max_connections` limit, and the current code won't
handle that.

## Summary of changes

Default to 1, but also allow the control plane to override this value for
specific projects. It's also recommended to bump
`superuser_reserved_connections` += `reconfigure_concurrency` for such
projects to ensure that we always have enough spare connections for
reconfiguration process to succeed.

Quick workaround for neondatabase/cloud#17846
2024-12-05 17:57:25 +00:00
Erik Grinaker
db79304416 storage_controller: increase shard scan timeout (#10000)
## Problem

The node shard scan timeout of 1 second is a bit too aggressive, and
we've seen this cause test failures. The scans are performed in parallel
across nodes, and the entire operation has a 15 second timeout.

Resolves #9801.

## Summary of changes

Increase the timeout to 5 seconds. This is still enough to time out on a
network failure and retry successfully within 15 seconds.
2024-12-05 17:29:21 +00:00
146 changed files with 9206 additions and 1862 deletions

View File

@@ -21,3 +21,7 @@ config-variables:
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN
- BENCHMARK_INGEST_TARGET_PROJECTID
- PGREGRESS_PG16_PROJECT_ID
- PGREGRESS_PG17_PROJECT_ID
- SLACK_ON_CALL_QA_STAGING_STREAM
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN

View File

@@ -15,10 +15,21 @@ inputs:
prefix:
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false
aws_oicd_role_arn:
description: "the OIDC role arn for aws auth"
required: false
default: ""
runs:
using: "composite"
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ inputs.aws_oicd_role_arn }}
role-duration-seconds: 3600
- name: Download artifact
id: download-artifact
shell: bash -euxo pipefail {0}

View File

@@ -62,6 +62,7 @@ runs:
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
@@ -70,6 +71,7 @@ runs:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
- name: Download compatibility snapshot
if: inputs.build_type != 'remote'
@@ -81,6 +83,7 @@ runs:
# The lack of compatibility snapshot (for example, for the new Postgres version)
# shouldn't fail the whole job. Only relevant test should fail.
skip-if-does-not-exist: true
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
- name: Checkout
if: inputs.needs_postgres_source == 'true'
@@ -218,6 +221,7 @@ runs:
# The lack of compatibility snapshot shouldn't fail the job
# (for example if we didn't run the test for non build-and-test workflow)
skip-if-does-not-exist: true
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
- name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
@@ -232,3 +236,4 @@ runs:
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

View File

@@ -14,9 +14,11 @@ runs:
name: coverage-data-artifact
path: /tmp/coverage
skip-if-does-not-exist: true # skip if there's no previous coverage to download
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
- name: Upload coverage data
uses: ./.github/actions/upload
with:
name: coverage-data-artifact
path: /tmp/coverage
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}

View File

@@ -14,6 +14,10 @@ inputs:
prefix:
description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false
aws_oicd_role_arn:
description: "the OIDC role arn for aws auth"
required: false
default: ""
runs:
using: "composite"
@@ -53,6 +57,13 @@ runs:
echo 'SKIPPED=false' >> $GITHUB_OUTPUT
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ inputs.aws_oicd_role_arn }}
role-duration-seconds: 3600
- name: Upload artifact
if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }}
shell: bash -euxo pipefail {0}

View File

@@ -70,6 +70,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
- name: Create benchmark_restore_status table if it does not exist

View File

@@ -31,12 +31,13 @@ defaults:
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
build-neon:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
contents: read
container:
image: ${{ inputs.build-tools-image }}
credentials:
@@ -205,6 +206,13 @@ jobs:
done
fi
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
@@ -256,6 +264,7 @@ jobs:
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
path: /tmp/neon
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
@@ -265,6 +274,10 @@ jobs:
regress-tests:
# Don't run regression tests on debug arm64 builds
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
permissions:
id-token: write # aws-actions/configure-aws-credentials
contents: read
statuses: write
needs: [ build-neon ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:
@@ -283,7 +296,7 @@ jobs:
submodules: true
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
@@ -295,6 +308,7 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

View File

@@ -105,6 +105,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
id: create-neon-project
@@ -204,6 +205,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Run Logical Replication benchmarks
uses: ./.github/actions/run-python-test-set
@@ -405,6 +407,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
@@ -708,6 +711,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Set up Connection String
id: set-up-connstr
@@ -818,6 +822,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Get Connstring Secret Name
run: |
@@ -926,6 +931,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Set up Connection String
id: set-up-connstr

View File

@@ -21,8 +21,6 @@ concurrency:
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
@@ -255,15 +253,17 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
# run without LFC on v17 release only
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
# Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the
# debug build with LFC enabled doesn't block merging.
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
{"pg_version":"v15", "lfc_state": "with-lfc"},
{"pg_version":"v16", "lfc_state": "with-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc" }]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -360,6 +360,11 @@ jobs:
create-test-report:
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
pull-requests: write
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -380,6 +385,7 @@ jobs:
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
@@ -411,6 +417,10 @@ jobs:
coverage-report:
if: ${{ !startsWith(github.ref_name, 'release') }}
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
runs-on: [ self-hosted, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -437,12 +447,14 @@ jobs:
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
path: /tmp/neon
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Get coverage artifact
uses: ./.github/actions/download
with:
name: coverage-data-artifact
path: /tmp/coverage
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
@@ -573,6 +585,10 @@ jobs:
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
steps:
- uses: docker/login-action@v3
@@ -587,11 +603,15 @@ jobs:
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
- uses: docker/login-action@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch image to ECR
run: |
@@ -600,6 +620,10 @@ jobs:
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, tag ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
strategy:
fail-fast: false
matrix:
@@ -640,11 +664,15 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
@@ -717,6 +745,10 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, tag ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
runs-on: ubuntu-22.04
strategy:
@@ -761,11 +793,15 @@ jobs:
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- uses: docker/login-action@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
run: |
@@ -795,7 +831,7 @@ jobs:
- pg: v17
debian: bookworm
env:
VM_BUILDER_VERSION: v0.35.0
VM_BUILDER_VERSION: v0.37.1
steps:
- uses: actions/checkout@v4
@@ -890,7 +926,9 @@ jobs:
runs-on: ubuntu-22.04
permissions:
id-token: write # for `aws-actions/configure-aws-credentials`
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
env:
VERSIONS: v14 v15 v16 v17
@@ -901,12 +939,15 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Login to dev ECR
uses: docker/login-action@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Copy vm-compute-node images to ECR
run: |
@@ -1060,12 +1101,79 @@ jobs:
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
steps:
- uses: actions/checkout@v4
- name: Create git tag and GitHub release
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
with:
retries: 5
script: |
const tag = "${{ needs.tag.outputs.build-tag }}";
try {
const existingRef = await github.rest.git.getRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `tags/${tag}`,
});
if (existingRef.data.object.sha !== context.sha) {
throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
}
console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Tag ${tag} does not exist. Creating it...`);
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `refs/tags/${tag}`,
sha: context.sha,
});
console.log(`Tag ${tag} created successfully.`);
}
# TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
if (context.ref !== 'refs/heads/release') {
console.log(`GitHub release skipped for ${context.ref}.`);
return;
}
try {
const existingRelease = await github.rest.repos.getReleaseByTag({
owner: context.repo.owner,
repo: context.repo.repo,
tag: tag,
});
console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Release for tag ${tag} does not exist. Creating it...`);
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: tag,
generate_release_notes: true,
});
console.log(`Release for tag ${tag} created successfully.`);
}
- name: Trigger deploy workflow
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1115,38 +1223,13 @@ jobs:
exit 1
fi
- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: "${{ needs.tag.outputs.build-tag }}",
generate_release_notes: true,
})
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: github.ref_name == 'release' && !failure() && !cancelled()

View File

@@ -19,15 +19,19 @@ concurrency:
group: ${{ github.workflow }}
cancel-in-progress: true
permissions:
id-token: write # aws-actions/configure-aws-credentials
jobs:
regress:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
runs-on: us-east-2
container:
@@ -40,9 +44,11 @@ jobs:
submodules: true
- name: Patch the test
env:
PG_VERSION: ${{matrix.pg-version}}
run: |
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
cd "vendor/postgres-v${PG_VERSION}"
patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
- name: Generate a random password
id: pwgen
@@ -55,8 +61,9 @@ jobs:
- name: Change tests according to the generated password
env:
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
PG_VERSION: ${{matrix.pg-version}}
run: |
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
for fname in sql/*.sql expected/*.out; do
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
done
@@ -72,27 +79,44 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create a new branch
id: create-branch
uses: ./.github/actions/neon-branch-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
- name: Run the regression tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: cloud_regress
pg_version: ${{ env.DEFAULT_PG_VERSION }}
pg_version: ${{matrix.pg-version}}
extra_params: -m remote_cluster
env:
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
- name: Delete branch
uses: ./.github/actions/neon-branch-delete
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
branch_id: ${{steps.create-branch.outputs.branch_id}}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # on-call-staging-stream
channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
slack-message: |
Periodic pg_regress on staging: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>

View File

@@ -64,6 +64,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
if: ${{ matrix.target_project == 'new_empty_project' }}

View File

@@ -143,6 +143,10 @@ jobs:
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -177,13 +181,18 @@ jobs:
- name: Produce the build stats
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Upload the build stats
id: upload-stats
env:
BUCKET: neon-github-public-dev
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html
aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/"

View File

@@ -21,6 +21,9 @@ defaults:
run:
shell: bash -euo pipefail {0}
permissions:
id-token: write # aws-actions/configure-aws-credentials
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
@@ -38,8 +41,6 @@ jobs:
env:
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
RUN_ID: ${{ github.run_id }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
AWS_DEFAULT_REGION : "eu-central-1"
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
steps:
@@ -50,6 +51,13 @@ jobs:
- name: Show my own (GitHub runner) external IP address - useful for IP allowlisting
run: curl https://ifconfig.me
- name: Assume AWS OIDC role that allows managing the EC2 instance (start/stop/describe...)
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
role-duration-seconds: 3600
- name: Start EC2 instance and wait for the instance to boot up
run: |
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
@@ -124,11 +132,10 @@ jobs:
cat "test_log_${GITHUB_RUN_ID}"
- name: Create Allure report
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
@@ -148,6 +155,14 @@ jobs:
-H "Authorization: Bearer $API_KEY" \
-d ''
- name: Assume AWS OIDC role that allows managing the EC2 instance (start/stop/describe...)
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
role-duration-seconds: 3600
- name: Stop EC2 instance and wait for the instance to be stopped
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |

View File

@@ -25,11 +25,13 @@ defaults:
run:
shell: bash -euxo pipefail {0}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write # require for posting a status update
env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
AWS_DEFAULT_REGION: eu-central-1
jobs:
@@ -94,6 +96,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
id: create-neon-project
@@ -126,6 +129,7 @@ jobs:
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
@@ -159,6 +163,7 @@ jobs:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
id: create-neon-project
@@ -191,6 +196,7 @@ jobs:
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

View File

@@ -67,7 +67,7 @@ jobs:
runs-on: ubuntu-22.04
permissions:
id-token: write # for `azure/login`
id-token: write # for `azure/login` and aws auth
steps:
- uses: docker/login-action@v3
@@ -75,11 +75,15 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1

View File

@@ -63,6 +63,7 @@ jobs:
if: always()
permissions:
statuses: write # for `github.repos.createCommitStatus(...)`
contents: write
needs:
- get-changed-files
- check-codestyle-python

View File

@@ -3,7 +3,7 @@ name: Create Release Branch
on:
schedule:
# It should be kept in sync with the if-conditions in the jobs below
- cron: '0 6 * * MON' # Storage release
- cron: '0 6 * * FRI' # Storage release
- cron: '0 6 * * THU' # Proxy release
workflow_dispatch:
inputs:
@@ -29,7 +29,7 @@ defaults:
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}
if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}
permissions:
contents: write

View File

@@ -1,16 +1,29 @@
/.github/ @neondatabase/developer-productivity
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/libs/proxy/ @neondatabase/proxy
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/vendor/ @neondatabase/compute
/compute/ @neondatabase/compute
/compute_tools/ @neondatabase/compute
# Proxy
/libs/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/proxy
# Storage
/pageserver/ @neondatabase/storage
/safekeeper/ @neondatabase/storage
/storage_controller @neondatabase/storage
/storage_scrubber @neondatabase/storage
/vendor/ @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Shared
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage

Cargo.lock (generated, 520 lines changed): file diff suppressed because it is too large

View File

@@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -216,6 +212,12 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
## Azure SDK crates
azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.13.1
ENV SQL_EXPORTER_VERSION=0.16.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \

View File

@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
#########################################################################################
#

View File

@@ -6,6 +6,7 @@
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
import 'sql_exporter/compute_max_connections.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',
import 'sql_exporter/compute_subscriptions_count.libsonnet',

View File

@@ -19,3 +19,10 @@ max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
;; Disable connection logging. It produces a lot of logs that no one looks at,
;; and we can get similar log entries from the proxy too. We had incidents in
;; the past where the logging significantly stressed the log device or pgbouncer
;; itself.
log_connections=0
log_disconnections=0

View File

@@ -0,0 +1,7 @@
SELECT
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files.
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -0,0 +1,17 @@
local neon = import 'neon.libsonnet';
local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
{
metric_name: 'compute_logical_snapshots_bytes',
type: 'gauge',
help: 'Size of the pg_logical/snapshots directory, not including temporary files',
key_labels: [
'timeline_id',
],
values: [
'logical_snapshots_bytes',
],
query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
}

View File

@@ -0,0 +1,9 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

File diff suppressed because it is too large

View File

@@ -246,47 +246,48 @@ fn try_spec_from_cli(
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
let spec;
let mut live_config_allowed = false;
match spec_json {
// First, try to get cluster spec from the cli argument
Some(json) => {
info!("got spec from cli argument {}", json);
spec = Some(serde_json::from_str(json)?);
}
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
spec = Some(serde_json::from_reader(file)?);
live_config_allowed = true;
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
live_config_allowed = true;
spec = match get_spec_from_control_plane(cp_base, id) {
Ok(s) => s,
Err(e) => {
error!("cannot get response from control plane: {}", e);
panic!("neither spec nor confirmation that compute is in the Empty state was received");
}
};
} else {
panic!("must specify both --control-plane-uri and --compute-id or none");
}
} else {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
}
}
// First, try to get cluster spec from the cli argument
if let Some(spec_json) = spec_json {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
live_config_allowed: false,
});
}
// Second, try to read it from the file if path is provided
if let Some(spec_path) = spec_path {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
live_config_allowed: true,
});
}
let Some(compute_id) = compute_id else {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
};
let Some(control_plane_uri) = control_plane_uri else {
panic!("must specify both --control-plane-uri and --compute-id or none");
};
Ok(CliSpecParams {
spec,
live_config_allowed,
})
match get_spec_from_control_plane(control_plane_uri, compute_id) {
Ok(spec) => Ok(CliSpecParams {
spec,
live_config_allowed: true,
}),
Err(e) => {
error!(
"cannot get response from control plane: {}\n\
neither spec nor confirmation that compute is in the Empty state was received",
e
);
Err(e)
}
}
}
struct CliSpecParams {

View File

@@ -1243,12 +1243,7 @@ impl ComputeNode {
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
// TODO(ololobus): We need a concurrency during reconfiguration as well,
// but DB is already running and used by user. We can easily get out of
// `max_connections` limit, and the current code won't handle that.
// let compute_state = self.state.lock().unwrap().clone();
// let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
let max_concurrent_connections = 1;
let max_concurrent_connections = spec.reconfigure_concurrency;
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:

View File

@@ -537,12 +537,14 @@ components:
properties:
extname:
type: string
versions:
type: array
version:
type: string
items:
type: string
n_databases:
type: integer
owned_by_superuser:
type: integer
SetRoleGrantsRequest:
type: object

View File

@@ -1,7 +1,6 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use metrics::proto::MetricFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use anyhow::Result;
use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Connect to every database (see list_dbs above) and get the list of installed extensions.
///
/// Same extension can be installed in multiple databases with different versions,
/// we only keep the highest and lowest version across all databases.
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
conf.application_name("compute_ctl:get_installed_extensions");
let mut client = conf.connect(NoTls)?;
let databases: Vec<String> = list_dbs(&mut client)?;
let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
for db in databases.iter() {
conf.dbname(db);
let mut db_client = conf.connect(NoTls)?;
let extensions: Vec<(String, String)> = db_client
let extensions: Vec<(String, String, i32)> = db_client
.query(
"SELECT extname, extversion FROM pg_catalog.pg_extension;",
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)?
.iter()
.map(|row| (row.get("extname"), row.get("extversion")))
.map(|row| {
(
row.get("extname"),
row.get("extversion"),
row.get("extowner"),
)
})
.collect();
for (extname, v) in extensions.iter() {
for (extname, v, extowner) in extensions.iter() {
let version = v.to_string();
// increment the number of databases where the version of extension is installed
INSTALLED_EXTENSIONS
.with_label_values(&[extname, &version])
.inc();
// check if the extension is owned by superuser
// OID 10 is the bootstrap superuser
let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };
extensions_map
.entry(extname.to_string())
.entry((
extname.to_string(),
version.clone(),
owned_by_superuser.to_string(),
))
.and_modify(|e| {
e.versions.insert(version.clone());
// count the number of databases where the extension is installed
e.n_databases += 1;
})
.or_insert(InstalledExtension {
extname: extname.to_string(),
versions: HashSet::from([version.clone()]),
version: version.clone(),
n_databases: 1,
owned_by_superuser: owned_by_superuser.to_string(),
});
}
}
let res = InstalledExtensions {
extensions: extensions_map.into_values().collect(),
};
for (key, ext) in extensions_map.iter() {
let (extname, version, owned_by_superuser) = key;
let n_databases = ext.n_databases as u64;
Ok(res)
INSTALLED_EXTENSIONS
.with_label_values(&[extname, version, owned_by_superuser])
.set(n_databases);
}
Ok(InstalledExtensions {
extensions: extensions_map.into_values().collect(),
})
}
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"compute_installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
&["extension_name", "version", "owned_by_superuser"]
)
.expect("failed to define a metric")
});
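
Switching from inc() to set() here makes the metric idempotent: the per-(extension, version, owner) counts are recomputed and the gauge is overwritten, instead of accumulating across repeated collections. A minimal sketch of the pattern using the upstream prometheus crate (metric and helper names are illustrative, not from this codebase):

use std::collections::HashMap;
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

static EXAMPLE_INSTALLED_EXTENSIONS: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_installed_extensions",
        "Number of databases where the extension version is installed",
        &["extension_name", "version"]
    )
    .expect("failed to define a metric")
});

fn publish(counts: &HashMap<(String, String), i64>) {
    for ((name, version), n) in counts {
        // set() overwrites the previous value, so re-running a collection
        // cannot double-count the way inc() in a loop would.
        EXAMPLE_INSTALLED_EXTENSIONS
            .with_label_values(&[name.as_str(), version.as_str()])
            .set(*n);
    }
}

fn main() {
    let mut counts = HashMap::new();
    counts.insert(("pg_stat_statements".to_string(), "1.10".to_string()), 3);
    publish(&counts);
    publish(&counts); // idempotent: the gauge still reads 3, not 6
}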

View File

@@ -274,6 +274,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
"AWS_PROFILE",
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
"HOME",

View File

@@ -53,6 +53,7 @@ use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -618,6 +619,7 @@ impl Endpoint {
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: 1,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -808,7 +810,7 @@ impl Endpoint {
}
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(120))
.build()
.unwrap();
let response = client
@@ -817,6 +819,7 @@ impl Endpoint {
self.http_address.ip(),
self.http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?

View File

@@ -435,7 +435,7 @@ impl PageServerNode {
) -> anyhow::Result<()> {
let config = Self::parse_config(settings)?;
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;
Ok(())

View File

@@ -9,8 +9,8 @@ use pageserver_api::{
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -116,9 +116,19 @@ enum Command {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// Set the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
/// Any previous tenant configs are overwritten.
SetTenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
/// provided JSON are unset from the tenant config and all fields with non-null values are set.
/// Unspecified fields are not changed.
PatchTenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
Command::SetTenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::PatchTenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.patch_tenant_config(&TenantConfigPatchRequest {
tenant_id,
config: tenant_conf,
})
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
threshold,
} => {
vps_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id,
config: TenantConfig {
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(

View File

@@ -42,6 +42,7 @@ allow = [
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
"Unicode-3.0",
]
confidence-threshold = 0.8
exceptions = [

View File

@@ -1,6 +1,5 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use std::collections::HashSet;
use std::fmt::Display;
use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
#[derive(Clone, Debug, Default, Serialize)]
pub struct InstalledExtension {
pub extname: String,
pub versions: HashSet<String>,
pub version: String,
pub n_databases: u32, // Number of databases using this extension
pub owned_by_superuser: String,
}
#[derive(Clone, Debug, Default, Serialize)]

View File

@@ -19,6 +19,10 @@ pub type PgIdent = String;
/// String type alias representing Postgres extension version
pub type ExtVersion = String;
fn default_reconfigure_concurrency() -> usize {
1
}
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optinal hint that can be passed to speed up startup time if we know
/// An optional hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
#[serde(default)]
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
pub local_proxy_config: Option<LocalProxySpec>,
/// Number of concurrent connections during the parallel RunInEachDatabase
/// phase of the apply config process.
///
/// We need a higher concurrency during reconfiguration when there are many DBs,
/// but the instance is already running and used by clients. We can easily exceed
/// the `max_connections` limit, and the current code won't handle that.
///
/// The default is 1, but the control plane may override this value for specific
/// projects. It's also recommended to bump `superuser_reserved_connections` +=
/// `reconfigure_concurrency` for such projects to ensure that we always have
/// enough spare connections for the reconfiguration process to succeed.
#[serde(default = "default_reconfigure_concurrency")]
pub reconfigure_concurrency: usize,
}
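
For illustration, a cut-down stand-in (SpecFragment is hypothetical, not ComputeSpec itself) showing how the serde default wires up: the field deserializes from the spec JSON when present and falls back to default_reconfigure_concurrency() when absent:

use serde::Deserialize;

fn default_reconfigure_concurrency() -> usize {
    1
}

#[derive(Deserialize)]
struct SpecFragment {
    // Absent in older control-plane specs, so fall back to 1.
    #[serde(default = "default_reconfigure_concurrency")]
    reconfigure_concurrency: usize,
}

fn main() {
    let overridden: SpecFragment =
        serde_json::from_str(r#"{"reconfigure_concurrency": 4}"#).unwrap();
    assert_eq!(overridden.reconfigure_concurrency, 4);
    let defaulted: SpecFragment = serde_json::from_str("{}").unwrap();
    assert_eq!(defaulted.reconfigure_concurrency, 1);
}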
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -315,6 +331,9 @@ mod tests {
// Features list defaults to empty vector.
assert!(spec.features.is_empty());
// Reconfigure concurrency defaults to 1.
assert_eq!(spec.reconfigure_concurrency, 1);
}
#[test]

View File

@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
pub struct AvailabilityZone(pub String);
impl Display for AvailabilityZone {
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
/// Scheduling policy enables us to selectively disable some automatic actions that the
/// controller performs on a tenant shard. This is only set to a non-default value by
/// human intervention, and it is reset to the default value (Active) when the tenant's
/// placement policy is modified away from Attached.
///
/// The typical use of a non-Active scheduling policy is one of:
/// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
///
/// If you're not sure which policy to use to pin a shard to its current location, you probably
/// want Pause.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including

View File

@@ -24,7 +24,7 @@ pub struct Key {
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
pub struct CompactKey(i128);
/// The storage key size.

View File

@@ -17,7 +17,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_with::serde_as;
use utils::{
completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
}
}
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum FieldPatch<T> {
Upsert(T),
Remove,
#[default]
Noop,
}
impl<T> FieldPatch<T> {
fn is_noop(&self) -> bool {
matches!(self, FieldPatch::Noop)
}
pub fn apply(self, target: &mut Option<T>) {
match self {
Self::Upsert(v) => *target = Some(v),
Self::Remove => *target = None,
Self::Noop => {}
}
}
pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
match self {
Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
Self::Remove => Ok(FieldPatch::<U>::Remove),
Self::Noop => Ok(FieldPatch::<U>::Noop),
}
}
}
impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Option::deserialize(deserializer).map(|opt| match opt {
None => FieldPatch::Remove,
Some(val) => FieldPatch::Upsert(val),
})
}
}
impl<T: Serialize> Serialize for FieldPatch<T> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match self {
FieldPatch::Upsert(val) => serializer.serialize_some(val),
FieldPatch::Remove => serializer.serialize_none(),
FieldPatch::Noop => unreachable!(),
}
}
}
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
#[serde(default)]
pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub checkpoint_distance: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub checkpoint_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_target_size: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_threshold: FieldPatch<usize>,
// defer parsing compaction_algorithm, like eviction_policy
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_horizon: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_creation_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub pitr_interval: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub walreceiver_connect_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lagging_wal_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub eviction_policy: FieldPatch<EvictionPolicy>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub min_resident_size_override: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub heatmap_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lazy_slru_download: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_layer_creation_check_threshold: FieldPatch<u8>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length_for_ts: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub timeline_offloading: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
}
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
impl TenantConfig {
pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
let Self {
mut checkpoint_distance,
mut checkpoint_timeout,
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_algorithm,
mut gc_horizon,
mut gc_period,
mut image_creation_threshold,
mut pitr_interval,
mut walreceiver_connect_timeout,
mut lagging_wal_timeout,
mut max_lsn_wal_lag,
mut eviction_policy,
mut min_resident_size_override,
mut evictions_low_residence_duration_metric_threshold,
mut heatmap_period,
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
patch
.compaction_target_size
.apply(&mut compaction_target_size);
patch.compaction_period.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.gc_horizon.apply(&mut gc_horizon);
patch.gc_period.apply(&mut gc_period);
patch
.image_creation_threshold
.apply(&mut image_creation_threshold);
patch.pitr_interval.apply(&mut pitr_interval);
patch
.walreceiver_connect_timeout
.apply(&mut walreceiver_connect_timeout);
patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
patch.eviction_policy.apply(&mut eviction_policy);
patch
.min_resident_size_override
.apply(&mut min_resident_size_override);
patch
.evictions_low_residence_duration_metric_threshold
.apply(&mut evictions_low_residence_duration_metric_threshold);
patch.heatmap_period.apply(&mut heatmap_period);
patch.lazy_slru_download.apply(&mut lazy_slru_download);
patch
.timeline_get_throttle
.apply(&mut timeline_get_throttle);
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch.lsn_lease_length.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
Self {
checkpoint_distance,
checkpoint_timeout,
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_algorithm,
gc_horizon,
gc_period,
image_creation_threshold,
pitr_interval,
walreceiver_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
eviction_policy,
min_resident_size_override,
evictions_low_residence_duration_metric_threshold,
heatmap_period,
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
}
}
}
/// The policy for the aux file storage.
///
/// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigPatchRequest {
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
}
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
);
}
}
#[test]
fn test_tenant_config_patch_request_serde() {
let patch_request = TenantConfigPatchRequest {
tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
config: TenantConfigPatch {
checkpoint_distance: FieldPatch::Upsert(42),
gc_horizon: FieldPatch::Remove,
compaction_threshold: FieldPatch::Noop,
..TenantConfigPatch::default()
},
};
let json = serde_json::to_string(&patch_request).unwrap();
let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
assert_eq!(json, expected);
let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
assert_eq!(decoded.tenant_id, patch_request.tenant_id);
assert_eq!(decoded.config, patch_request.config);
// Now apply the patch to a config to demonstrate semantics
let base = TenantConfig {
checkpoint_distance: Some(28),
gc_horizon: Some(100),
compaction_target_size: Some(1024),
..Default::default()
};
let expected = TenantConfig {
checkpoint_distance: Some(42),
gc_horizon: None,
..base.clone()
};
let patched = base.apply_patch(decoded.config);
assert_eq!(patched, expected);
}
}
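
A self-contained sketch of the tri-state patch pattern implemented by FieldPatch above (Patch and ConfigPatch are illustrative stand-ins, not the real types): a field absent from the JSON stays at the no-op default thanks to #[serde(default)], an explicit null maps to Remove, and a concrete value maps to Upsert:

use serde::Deserialize;

#[derive(Debug, Default, PartialEq)]
enum Patch<T> {
    Upsert(T),
    Remove,
    #[default]
    Noop,
}

impl<'de, T: Deserialize<'de>> Deserialize<'de> for Patch<T> {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        // The field was present in the JSON: null means Remove, a value means Upsert.
        Ok(match Option::<T>::deserialize(d)? {
            None => Patch::Remove,
            Some(v) => Patch::Upsert(v),
        })
    }
}

#[derive(Debug, Default, Deserialize)]
#[serde(default)] // fields absent from the JSON fall back to Patch::Noop
struct ConfigPatch {
    gc_horizon: Patch<u64>,
    pitr_interval: Patch<String>,
}

fn main() {
    let p: ConfigPatch =
        serde_json::from_str(r#"{"gc_horizon": null, "pitr_interval": "7 days"}"#).unwrap();
    assert_eq!(p.gc_horizon, Patch::Remove); // explicit null unsets the field
    assert_eq!(p.pitr_interval, Patch::Upsert("7 days".to_string()));
    let q: ConfigPatch = serde_json::from_str("{}").unwrap();
    assert_eq!(q.gc_horizon, Patch::Noop); // absent field is left unchanged
}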

View File

@@ -158,7 +158,8 @@ impl ShardIdentity {
key_to_shard_number(self.count, self.stripe_size, key)
}
/// Return true if the key should be ingested by this shard
/// Return true if the key is stored only on this shard. This does not include
/// global keys, see is_key_global().
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
@@ -171,7 +172,7 @@ impl ShardIdentity {
}
/// Return true if the key should be stored on all shards, not just one.
fn is_key_global(&self, key: &Key) -> bool {
pub fn is_key_global(&self, key: &Key) -> bool {
if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
// Special keys that are only stored on shard 0
false

View File

@@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] }
humantime-serde.workspace = true
hyper = { workspace = true, features = ["client"] }
futures.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }

View File

@@ -13,10 +13,10 @@ use std::time::Duration;
use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Context;
use anyhow::Result;
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
use azure_core::{Continuable, RetryOptions};
use azure_identity::DefaultAzureCredential;
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
use azure_storage::StorageCredentials;
use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::prelude::ClientBuilder;
@@ -76,13 +76,15 @@ impl AzureBlobStorage {
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
StorageCredentials::access_key(account.clone(), access_key)
} else {
let token_credential = DefaultAzureCredential::default();
StorageCredentials::token_credential(Arc::new(token_credential))
let token_credential = azure_identity::create_default_credential()
.context("trying to obtain Azure default credentials")?;
StorageCredentials::token_credential(token_credential)
};
// we have an outer retry
let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
let builder = ClientBuilder::new(account, credentials)
.retry(RetryOptions::none())
.transport(TransportOptions::new(reqwest_client(small_timeout)));
let client = builder.container_client(azure_config.container_name.to_owned());
let max_keys_per_list_response =
@@ -261,6 +263,16 @@ impl AzureBlobStorage {
}
}
fn reqwest_client(timeout: Duration) -> Arc<dyn HttpClient> {
let client = reqwest::ClientBuilder::new()
.pool_max_idle_per_host(0)
.read_timeout(timeout)
.connect_timeout(timeout)
.build()
.expect("failed to build `reqwest` client");
Arc::new(client)
}
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
let mut res = Metadata::new();
for (k, v) in metadata.0.into_iter() {
@@ -624,6 +636,10 @@ impl RemoteStorage for AzureBlobStorage {
res
}
fn max_keys_per_delete(&self) -> usize {
super::MAX_KEYS_PER_DELETE_AZURE
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -70,7 +70,14 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
/// As defined in S3 docs
pub const MAX_KEYS_PER_DELETE: usize = 1000;
///
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
/// As defined in Azure docs
///
/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -340,6 +347,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking
///
/// The value returned is only an optimization hint; one can pass a larger number of
/// objects to `delete_objects` as well.
///
/// The value is guaranteed to be >= 1.
fn max_keys_per_delete(&self) -> usize;
/// Deletes all objects matching the given prefix.
///
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
@@ -533,6 +548,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// [`RemoteStorage::max_keys_per_delete`]
pub fn max_keys_per_delete(&self) -> usize {
match self {
Self::LocalFs(s) => s.max_keys_per_delete(),
Self::AwsS3(s) => s.max_keys_per_delete(),
Self::AzureBlob(s) => s.max_keys_per_delete(),
Self::Unreliable(s) => s.max_keys_per_delete(),
}
}
/// See [`RemoteStorage::delete_prefix`]
pub async fn delete_prefix(
&self,

View File

@@ -573,6 +573,10 @@ impl RemoteStorage for LocalFs {
Ok(())
}
fn max_keys_per_delete(&self) -> usize {
super::MAX_KEYS_PER_DELETE_S3
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -48,7 +48,7 @@ use crate::{
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
@@ -355,7 +355,7 @@ impl S3Bucket {
let kind = RequestKind::Delete;
let mut cancel = std::pin::pin!(cancel.cancelled());
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
let started_at = start_measuring_requests(kind);
let req = self
@@ -832,6 +832,10 @@ impl RemoteStorage for S3Bucket {
self.delete_oids(&permit, &delete_objects, cancel).await
}
fn max_keys_per_delete(&self) -> usize {
MAX_KEYS_PER_DELETE_S3
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths, cancel).await

View File

@@ -203,6 +203,10 @@ impl RemoteStorage for UnreliableWrapper {
Ok(())
}
fn max_keys_per_delete(&self) -> usize {
self.inner.max_keys_per_delete()
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -94,6 +94,8 @@ pub mod toml_edit_ext;
pub mod circuit_breaker;
pub mod try_rcu;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;

View File

@@ -164,6 +164,12 @@ impl TenantShardId {
}
}
impl std::fmt::Display for ShardNumber {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl std::fmt::Display for ShardSlug<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(

libs/utils/src/try_rcu.rs (new file, 77 lines)
View File

@@ -0,0 +1,77 @@
//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
pub trait ArcSwapExt<T> {
/// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with a `Result` that short-circuits on error.
fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
where
F: FnMut(&T) -> Result<R, E>,
R: Into<T>;
}
impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
where
T: arc_swap::RefCnt,
S: arc_swap::strategy::CaS<T>,
{
fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
where
F: FnMut(&T) -> Result<R, E>,
R: Into<T>,
{
fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
where
A: arc_swap::AsRaw<Base>,
B: arc_swap::AsRaw<Base>,
{
let a = a.as_raw();
let b = b.as_raw();
std::ptr::eq(a, b)
}
let mut cur = self.load();
loop {
let new = f(&cur)?.into();
let prev = self.compare_and_swap(&*cur, new);
let swapped = ptr_eq(&*cur, &*prev);
if swapped {
return Ok(arc_swap::Guard::into_inner(prev));
} else {
cur = prev;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use arc_swap::ArcSwap;
use std::sync::Arc;
#[test]
fn test_try_rcu_success() {
let swap = ArcSwap::from(Arc::new(42));
let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
assert!(result.is_ok());
assert_eq!(**swap.load(), 43);
}
#[test]
fn test_try_rcu_error() {
let swap = ArcSwap::from(Arc::new(42));
let result = swap.try_rcu(|value| -> Result<i32, _> {
if **value == 42 {
Err("err")
} else {
Ok(**value + 1)
}
});
assert!(result.is_err());
assert_eq!(result.unwrap_err(), "err");
assert_eq!(**swap.load(), 42);
}
}

View File

@@ -37,7 +37,7 @@ message ValueMeta {
}
message CompactKey {
int64 high = 1;
int64 low = 2;
uint64 high = 1;
uint64 low = 2;
}

View File

@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
impl From<CompactKey> for proto::CompactKey {
fn from(value: CompactKey) -> Self {
proto::CompactKey {
high: (value.raw() >> 64) as i64,
low: value.raw() as i64,
high: (value.raw() >> 64) as u64,
low: value.raw() as u64,
}
}
}
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
(((value.high as i128) << 64) | (value.low as i128)).into()
}
}
#[test]
fn test_compact_key_with_large_relnode() {
use pageserver_api::key::Key;
let inputs = vec![
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x007FFFFF,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x00800000,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x00800001,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0xFFFFFFFF,
field3: 0xFFFFFFFF,
field4: 0xFFFFFFFF,
field5: 0x0,
field6: 0x0,
},
];
for input in inputs {
assert!(input.is_valid_key_on_write_path());
let compact = input.to_compact();
let proto: proto::CompactKey = compact.into();
let from_proto: CompactKey = proto.into();
assert_eq!(
compact, from_proto,
"Round trip failed for key with relnode={:#x}",
input.field4
);
}
}
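
The i64-to-u64 change matters because of sign extension: widening an i64 low half whose top bit is set back to i128 fills the high word with ones, corrupting the reassembled key, while u64 zero-extends and round-trips losslessly. A standalone demonstration using only std:

fn main() {
    let raw: i128 = -1; // all 128 bits set, so the low half has bit 63 set
    let low_signed = raw as i64;
    let low_unsigned = raw as u64;
    // Widening back to i128: i64 sign-extends, u64 zero-extends.
    assert_eq!(low_signed as i128, -1); // ones across all 128 bits
    assert_eq!(low_unsigned as i128, 0xFFFF_FFFF_FFFF_FFFF); // only the low 64 bits
    // With the signed cast, OR-ing the halves back together clobbers `high`;
    // with the unsigned cast, ((high as i128) << 64) | (low as i128) is lossless.
}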

View File

@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
println!("cargo:rustc-link-lib=static=walproposer");
println!("cargo:rustc-link-lib=static=pgport");
println!("cargo:rustc-link-lib=static=pgcommon");
println!("cargo:rustc-link-lib=static=walproposer");
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
// Rebuild crate when libwalproposer.a changes

View File

@@ -270,12 +270,18 @@ impl Client {
Ok(body)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
Ok(())
}
pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PATCH, &uri, req).await?;
Ok(())
}
pub async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,

View File

@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("operating on timeline {}", timeline);
mgmt_api_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig::default(),
})

View File

@@ -9,7 +9,6 @@
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::TimeoutOrCancel;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
@@ -131,7 +130,8 @@ impl Deleter {
}
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
self.accumulator.reserve(max_keys_per_delete);
loop {
if self.cancel.is_cancelled() {
@@ -156,14 +156,14 @@ impl Deleter {
match msg {
DeleterMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
if self.accumulator.len() == max_keys_per_delete {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let available_slots = max_keys_per_delete - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);

View File

@@ -767,7 +767,27 @@ paths:
/v1/tenant/config:
put:
description: |
Update tenant's config.
Update the tenant's config by setting it to the provided value.
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/TenantConfigRequest"
responses:
"200":
description: OK
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/TenantInfo"
patch:
description: |
Update the tenant's config additively by patching only the provided fields.
Null values unset the field and non-null values upsert it.
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:

View File

@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::OffloadedTimelineInfo;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantConfigPatchRequest;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
use pageserver_api::models::TenantLocationConfigResponse;
@@ -87,7 +88,7 @@ use crate::tenant::timeline::offload::offload_timeline;
use crate::tenant::timeline::offload::OffloadError;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactOptions;
use crate::tenant::timeline::CompactRange;
use crate::tenant::timeline::CompactRequest;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
@@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler(
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
tenant.set_new_tenant_config(new_tenant_conf);
let _ = tenant
.update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
.expect("Closure returns Ok()");
json_response(StatusCode::OK, ())
}
async fn patch_tenant_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let updated = tenant
.update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
.map_err(ApiError::BadRequest)?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
updated,
tenant.get_generation(),
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
json_response(StatusCode::OK, ())
}
@@ -1978,6 +2019,26 @@ async fn timeline_gc_handler(
json_response(StatusCode::OK, gc_result)
}
// Cancel scheduled compaction tasks
async fn timeline_cancel_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.cancel_scheduled_compaction(timeline_id);
json_response(StatusCode::OK, ())
}
.instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
}
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
mut request: Request<Body>,
@@ -1987,7 +2048,7 @@ async fn timeline_compact_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;
let state = get_state(&request);
@@ -2012,22 +2073,57 @@ async fn timeline_compact_handler(
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
let wait_until_scheduled_compaction_done =
parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
.unwrap_or(false);
let sub_compaction = compact_request
.as_ref()
.map(|r| r.sub_compaction)
.unwrap_or(false);
let sub_compaction_max_job_size_mb = compact_request
.as_ref()
.and_then(|r| r.sub_compaction_max_job_size_mb);
let options = CompactOptions {
compact_range,
compact_key_range: compact_request
.as_ref()
.and_then(|r| r.compact_key_range.clone()),
compact_lsn_range: compact_request
.as_ref()
.and_then(|r| r.compact_lsn_range.clone()),
flags,
sub_compaction,
sub_compaction_max_job_size_mb,
};
let scheduled = compact_request
.as_ref()
.map(|r| r.scheduled)
.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
if scheduled {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
if wait_until_scheduled_compaction_done {
// It is possible that this will take a long time; dropping the HTTP request will not cancel the compaction.
rx.await.ok();
}
} else {
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
}
}
json_response(StatusCode::OK, ())
}
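
The scheduled path above follows the common oneshot-channel pattern: scheduling returns a receiver that resolves when the queued compaction completes, and the handler may await it or return immediately. A minimal sketch under assumed names (schedule_job is illustrative, not the pageserver API; requires the tokio runtime):

use tokio::sync::oneshot;

async fn schedule_job() -> oneshot::Receiver<()> {
    let (done_tx, done_rx) = oneshot::channel();
    tokio::spawn(async move {
        // ... the queued work runs here ...
        let _ = done_tx.send(()); // signal completion; ignore a dropped receiver
    });
    done_rx
}

#[tokio::main]
async fn main() {
    let wait_until_done = true;
    let rx = schedule_job().await;
    if wait_until_done {
        // Dropping rx instead would leave the job running in the background,
        // mirroring how dropping the HTTP request does not cancel compaction.
        rx.await.ok();
    }
}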
@@ -2108,16 +2204,20 @@ async fn timeline_checkpoint_handler(
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
let wait_until_flushed: bool =
parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
timeline
.freeze_and_flush()
.await
.map_err(|e| {
if wait_until_flushed {
timeline.freeze_and_flush().await
} else {
timeline.freeze().await.and(Ok(()))
}.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
@@ -3236,6 +3336,9 @@ pub fn make_router(
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
})
.patch("/v1/tenant/config", |r| {
api_handler(r, patch_tenant_config_handler)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
@@ -3301,6 +3404,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_compact_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_cancel_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
|r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),

View File

@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
use pq_proto::framed::ConnectionError;
use strum::{EnumCount, VariantNames};
use strum_macros::{IntoStaticStr, VariantNames};
use tracing::warn;
use utils::id::TimelineId;
/// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -464,6 +463,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_disk_consistent_lsn",
"Disk consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_projected_remote_consistent_lsn",
"Projected remote consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
@@ -1205,54 +1222,163 @@ pub(crate) mod virtual_file_io_engine {
});
}
pub(crate) struct SmgrOpTimer {
global_latency_histo: Histogram,
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
pub(crate) struct SmgrOpTimerInner {
global_execution_latency_histo: Histogram,
per_timeline_execution_latency_histo: Option<Histogram>,
// Optional because not all op types are tracked per-timeline
per_timeline_latency_histo: Option<Histogram>,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
start: Instant,
throttled: Duration,
op: SmgrQueryType,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
timings: SmgrOpTimerState,
}
#[derive(Debug)]
enum SmgrOpTimerState {
Received {
received_at: Instant,
},
ThrottleDoneExecutionStarting {
received_at: Instant,
throttle_started_at: Instant,
started_execution_at: Instant,
},
}
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
impl SmgrOpTimer {
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
let Some(throttle) = throttle else {
return;
};
self.throttled += *throttle;
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
let inner = self.0.as_mut().expect("other public methods consume self");
match (&mut inner.timings, throttle) {
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
ThrottleResult::NotThrottled { start } => {
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at: *received_at,
throttle_started_at: *start,
started_execution_at: *start,
};
}
ThrottleResult::Throttled { start, end } => {
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at: *start,
throttle_started_at: *start,
started_execution_at: *end,
};
}
},
(x, _) => panic!("called in unexpected state: {x:?}"),
}
}
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
let (flush_start, inner) = self
.smgr_op_end()
.expect("this method consume self, and the only other caller is drop handler");
let SmgrOpTimerInner {
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
..
} = inner;
SmgrOpFlushInProgress {
flush_started_at: flush_start,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
}
}
/// Returns `None` if this method has already been called, `Some` otherwise.
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
let inner = self.0.take()?;
let now = Instant::now();
let batch;
let execution;
let throttle;
match inner.timings {
SmgrOpTimerState::Received { received_at } => {
batch = (now - received_at).as_secs_f64();
// TODO: use label for dropped requests.
// This is quite rare in practice, only during tenant/pageserver shutdown.
throttle = Duration::ZERO;
execution = Duration::ZERO.as_secs_f64();
}
SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at,
throttle_started_at,
started_execution_at,
} => {
batch = (throttle_started_at - received_at).as_secs_f64();
throttle = started_execution_at - throttle_started_at;
execution = (now - started_execution_at).as_secs_f64();
}
}
// update time spent in batching
inner.global_batch_wait_time.observe(batch);
inner.per_timeline_batch_wait_time.observe(batch);
// the time-spent-in-throttle metric is updated by the throttle impl
let _ = throttle;
// update metrics for execution latency
inner.global_execution_latency_histo.observe(execution);
if let Some(per_timeline_execution_latency_histo) =
&inner.per_timeline_execution_latency_histo
{
per_timeline_execution_latency_histo.observe(execution);
}
Some((now, inner))
}
}
impl Drop for SmgrOpTimer {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
self.smgr_op_end();
}
}
let elapsed = match elapsed.checked_sub(self.throttled) {
Some(elapsed) => elapsed,
None => {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
Lazy::new(|| {
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
RateLimit::new(Duration::from_secs(10))
})))
});
let mut guard = LOGGED.lock().unwrap();
let rate_limit = &mut guard[self.op];
rate_limit.call(|| {
warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
});
elapsed // un-throttled time, more info than just saturating to 0
impl SmgrOpFlushInProgress {
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
where
Fut: std::future::Future<Output = O>,
{
let mut fut = std::pin::pin!(fut);
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to the metrics.
// The last call is tracked in `self.flush_started_at`.
let mut observe_guard = scopeguard::guard(
    || {
        // Take a fresh timestamp on every invocation; a single timestamp
        // captured outside the closure would make repeat observations no-ops.
        let now = Instant::now();
        let elapsed = now - self.flush_started_at;
        self.global_micros
            .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
        self.per_timeline_micros
            .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
        self.flush_started_at = now;
},
|mut observe| {
observe();
},
);
loop {
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
Ok(v) => return v,
Err(_timeout) => {
(*observe_guard)();
}
}
};
let elapsed = elapsed.as_secs_f64();
self.global_latency_histo.observe(elapsed);
if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(elapsed);
}
}
}
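// Illustrative sketch, not pageserver code: a toy reconstruction of how
// `smgr_op_end` above splits one fully-processed request's wall-clock time
// into the three observed phases.
use std::time::{Duration, Instant};

fn split_phases(
    received_at: Instant,
    throttle_started_at: Instant,
    started_execution_at: Instant,
    now: Instant,
) -> (Duration, Duration, Duration) {
    let batch = throttle_started_at - received_at; // waiting in the batch
    let throttle = started_execution_at - throttle_started_at; // rate-limiter wait
    let execution = now - started_execution_at; // actual smgr work
    (batch, throttle, execution)
}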
@@ -1284,6 +1410,10 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_getpage_latency: Histogram,
global_batch_size: Histogram,
per_timeline_batch_size: Histogram,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1306,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
.expect("failed to define a metric")
});
// Alias so all histograms recording per-timeline smgr timings use the same buckets.
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
"Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
)
.expect("failed to define a metric")
});
@@ -1369,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds_global",
"Time spent on smgr query handling, aggregated by query type.",
"Like pageserver_smgr_query_seconds, but aggregated to instance level.",
&["smgr_query_type"],
SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
)
@@ -1446,6 +1579,45 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
.set(value.try_into().unwrap());
}
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_service_pagestream_flush_in_progress_micros",
"Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
easily discoverable in monitoring. \
Hence, this is NOT a completion latency historgram.",
&["tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_service_pagestream_flush_in_progress_micros_global",
"Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_page_service_pagestream_batch_wait_time_seconds",
"Time a request spent waiting in its batch until the batch moved to throttle&execution.",
&["tenant_id", "shard_id", "timeline_id"],
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_service_pagestream_batch_wait_time_seconds_global",
"Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
)
.expect("failed to define a metric")
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1486,6 +1658,17 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_flush_in_progress_micros =
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
Self {
global_started,
global_latency,
@@ -1493,9 +1676,13 @@ impl SmgrQueryTimePerTimeline {
per_timeline_getpage_started,
global_batch_size,
per_timeline_batch_size,
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
global_batch_wait_time,
per_timeline_batch_wait_time,
}
}
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
self.global_started[op as usize].inc();
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1505,13 +1692,17 @@ impl SmgrQueryTimePerTimeline {
None
};
SmgrOpTimer {
global_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_latency_histo,
start: started_at,
op,
throttled: Duration::ZERO,
}
SmgrOpTimer(Some(SmgrOpTimerInner {
global_execution_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_execution_latency_histo: per_timeline_latency_histo,
timings: SmgrOpTimerState::Received { received_at },
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
per_timeline_flush_in_progress_micros: self
.per_timeline_flush_in_progress_micros
.clone(),
global_batch_wait_time: self.global_batch_wait_time.clone(),
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
}))
}
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
@@ -2186,6 +2377,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
.expect("failed to define a metric"),
});
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_timeline_wal_records_received",
"Number of WAL records received per shard",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -2394,7 +2594,8 @@ pub(crate) struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub last_record_lsn_gauge: IntGauge,
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
@@ -2412,6 +2613,7 @@ pub(crate) struct TimelineMetrics {
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
pub wal_records_received: IntCounter,
shutdown: std::sync::atomic::AtomicBool,
}
@@ -2475,7 +2677,11 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let last_record_gauge = LAST_RECORD_LSN
let last_record_lsn_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2565,6 +2771,10 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
shard_id,
@@ -2578,7 +2788,8 @@ impl TimelineMetrics {
garbage_collect_histo,
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
last_record_lsn_gauge,
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
layer_size_image,
@@ -2596,6 +2807,7 @@ impl TimelineMetrics {
evictions_with_low_residence_duration,
),
valid_lsn_lease_count_gauge,
wal_records_received,
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -2642,6 +2854,7 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
@@ -2732,6 +2945,21 @@ impl TimelineMetrics {
shard_id,
timeline_id,
]);
let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
@@ -2762,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::throttle::ThrottleResult;
use crate::tenant::Timeline;
/// Maintain a per timeline gauge in addition to the global gauge.
@@ -2805,6 +3034,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
}
impl RemoteTimelineClientMetrics {
@@ -2819,6 +3049,10 @@ impl RemoteTimelineClientMetrics {
.unwrap(),
);
let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap();
RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
@@ -2827,6 +3061,7 @@ impl RemoteTimelineClientMetrics {
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
projected_remote_consistent_lsn_gauge,
}
}
@@ -3040,6 +3275,7 @@ impl Drop for RemoteTimelineClientMetrics {
calls,
bytes_started_counter,
bytes_finished_counter,
projected_remote_consistent_lsn_gauge,
} = self;
for ((a, b), _) in calls.get_mut().unwrap().drain() {
let mut res = [Ok(()), Ok(())];
@@ -3069,6 +3305,14 @@ impl Drop for RemoteTimelineClientMetrics {
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
{
let _ = projected_remote_consistent_lsn_gauge;
let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
}
@@ -3601,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
&CIRCUIT_BREAKERS_BROKEN,
&CIRCUIT_BREAKERS_UNBROKEN,
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
]
.into_iter()
.for_each(|c| {
@@ -3648,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
&WAL_REDO_BYTES_HISTOGRAM,
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
&PAGE_SERVICE_BATCH_SIZE_GLOBAL,
&PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
]
.into_iter()
.for_each(|h| {

View File

@@ -575,7 +575,10 @@ enum BatchedFeMessage {
}
impl BatchedFeMessage {
async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
async fn throttle_and_record_start_processing(
&mut self,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
let (shard, tokens, timers) = match self {
BatchedFeMessage::Exists { shard, timer, .. }
| BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
}
};
for timer in timers {
timer.deduct_throttle(&throttled);
timer.observe_throttle_done_execution_starting(&throttled);
}
Ok(())
}
@@ -1017,10 +1020,8 @@ impl PageServerHandler {
// Map the handler result to protocol behavior.
// Some handler errors cause an exit from the pagestream protocol.
// Other handler errors are sent back as an error message, and we stay in the pagestream protocol.
let mut timers: smallvec::SmallVec<[_; 1]> =
smallvec::SmallVec::with_capacity(handler_results.len());
for handler_result in handler_results {
let response_msg = match handler_result {
let (response_msg, timer) = match handler_result {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1044,34 +1045,66 @@ impl PageServerHandler {
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
}),
None, // TODO: measure errors
)
}
},
Ok((response_msg, timer)) => {
// Extending the lifetime of the timers so observations on drop
// include the flush time.
timers.push(timer);
response_msg
}
Ok((response_msg, timer)) => (response_msg, Some(timer)),
};
//
// marshal & transmit response message
//
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
}
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = pgb_writer.flush() => {
res?;
// We purposefully don't count flush time into the timer.
//
// The reason is that the current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
// This is especially the case for prefetch.
//
// If the compute doesn't read from the connection, eventually TCP will backpressure
// all the way into our flush call below.
//
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
let flushing_timer =
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
// what we want to do
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
let flush_fut = match flushing_timer {
Some(flushing_timer) => {
futures::future::Either::Left(flushing_timer.measure(flush_fut))
}
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
let _: () = async move {
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = flush_fut => {
res?;
}
}
Ok(())
}
// and log the info! line inside the request span
.instrument(span.clone())
.await?;
}
drop(timers);
Ok(())
}
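// Illustrative sketch with assumed names, not part of this diff: the Either
// pattern used above lets the handler pick at runtime between a measured and
// an unmeasured flush future without boxing, because futures::future::Either
// implements Future when both arms share an Output type.
use std::future::Future;
use std::time::Instant;

use futures::future::Either;

async fn timed<F: Future>(fut: F) -> F::Output {
    let started = Instant::now();
    let out = fut.await;
    eprintln!("flush took {:?}", started.elapsed()); // stand-in for the metric update
    out
}

async fn flush_maybe_timed<F: Future>(flush_fut: F, measure: bool) -> F::Output {
    let fut = match measure {
        true => Either::Left(timed(flush_fut)),
        false => Either::Right(flush_fut),
    };
    fut.await
}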
@@ -1200,7 +1233,7 @@ impl PageServerHandler {
}
};
if let Err(cancelled) = msg.throttle(&self.cancel).await {
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
break cancelled;
}
@@ -1367,7 +1400,9 @@ impl PageServerHandler {
return Err(e);
}
};
batch.throttle(&self.cancel).await?;
batch
.throttle_and_record_start_processing(&self.cancel)
.await?;
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
.await?;
}

File diff suppressed because it is too large

View File

@@ -11,7 +11,7 @@
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
.or(global_conf.wal_receiver_protocol_override),
}
}
pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
let Self {
mut checkpoint_distance,
mut checkpoint_timeout,
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_algorithm,
mut gc_horizon,
mut gc_period,
mut image_creation_threshold,
mut pitr_interval,
mut walreceiver_connect_timeout,
mut lagging_wal_timeout,
mut max_lsn_wal_lag,
mut eviction_policy,
mut min_resident_size_override,
mut evictions_low_residence_duration_metric_threshold,
mut heatmap_period,
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
patch
.checkpoint_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut checkpoint_timeout);
patch
.compaction_target_size
.apply(&mut compaction_target_size);
patch
.compaction_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.gc_horizon.apply(&mut gc_horizon);
patch
.gc_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut gc_period);
patch
.image_creation_threshold
.apply(&mut image_creation_threshold);
patch
.pitr_interval
.map(|v| humantime::parse_duration(&v))?
.apply(&mut pitr_interval);
patch
.walreceiver_connect_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut walreceiver_connect_timeout);
patch
.lagging_wal_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lagging_wal_timeout);
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
patch.eviction_policy.apply(&mut eviction_policy);
patch
.min_resident_size_override
.apply(&mut min_resident_size_override);
patch
.evictions_low_residence_duration_metric_threshold
.map(|v| humantime::parse_duration(&v))?
.apply(&mut evictions_low_residence_duration_metric_threshold);
patch
.heatmap_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut heatmap_period);
patch.lazy_slru_download.apply(&mut lazy_slru_download);
patch
.timeline_get_throttle
.apply(&mut timeline_get_throttle);
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch
.lsn_lease_length
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
Ok(Self {
checkpoint_distance,
checkpoint_timeout,
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_algorithm,
gc_horizon,
gc_period,
image_creation_threshold,
pitr_interval,
walreceiver_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
eviction_policy,
min_resident_size_override,
evictions_low_residence_duration_metric_threshold,
heatmap_period,
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
})
}
}
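// Illustrative sketch, not from this PR: minimal patch semantics assumed by
// `apply_patch` above. The real field-patch type lives in pageserver_api and
// may differ; this toy version shows why `map` (fallible transform, e.g.
// humantime parsing) composes with `apply` (keep / clear / overwrite).
use std::time::Duration;

enum FieldPatch<T> {
    Keep,   // field absent from the patch: keep the current value
    Remove, // field present as null: reset to None
    Set(T), // field present with a value: overwrite
}

impl<T> FieldPatch<T> {
    fn map<U, E>(self, f: impl FnOnce(T) -> Result<U, E>) -> Result<FieldPatch<U>, E> {
        Ok(match self {
            FieldPatch::Keep => FieldPatch::Keep,
            FieldPatch::Remove => FieldPatch::Remove,
            FieldPatch::Set(v) => FieldPatch::Set(f(v)?),
        })
    }

    fn apply(self, target: &mut Option<T>) {
        match self {
            FieldPatch::Keep => {}
            FieldPatch::Remove => *target = None,
            FieldPatch::Set(v) => *target = Some(v),
        }
    }
}

fn main() -> anyhow::Result<()> {
    let mut checkpoint_timeout: Option<Duration> = Some(Duration::from_secs(600));
    FieldPatch::Set("5m".to_string())
        .map(|v| humantime::parse_duration(&v))?
        .apply(&mut checkpoint_timeout);
    assert_eq!(checkpoint_timeout, Some(Duration::from_secs(300)));
    Ok(())
}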
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {

View File

@@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::{collections::HashMap, sync::Arc};
use utils::id::TimelineId;
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
Do not add any more features that take, or forbid taking, this lock. It should be
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
/// synchronizes with gc attempts by locking and unlocking this mutex.
blocking: tokio::sync::Mutex<()>,
blocking: Arc<tokio::sync::Mutex<()>>,
}
impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
/// it's ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
@@ -44,7 +44,7 @@ impl GcBlock {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
_inner: self.blocking.clone().lock_owned().await,
})
}
}
@@ -170,8 +170,8 @@ impl GcBlock {
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
pub(crate) struct Guard {
_inner: tokio::sync::OwnedMutexGuard<()>,
}
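// Illustrative sketch, not from this PR: why the guard switched to
// OwnedMutexGuard. `lock_owned` consumes an Arc clone instead of borrowing,
// so the returned guard is 'static and can be stored in a struct or moved
// into another task (which the scheduled-compaction work above relies on).
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

struct Block {
    blocking: Arc<Mutex<()>>,
}

struct Guard {
    _inner: OwnedMutexGuard<()>,
}

impl Block {
    async fn start(&self) -> Guard {
        Guard {
            _inner: self.blocking.clone().lock_owned().await,
        }
    }
}

#[tokio::main]
async fn main() {
    let block = Block { blocking: Arc::new(Mutex::new(())) };
    let guard = block.start().await;
    // The guard no longer borrows `block`, so it can outlive this scope:
    tokio::spawn(async move { drop(guard) }).await.unwrap();
}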
#[derive(Debug)]

View File

@@ -2192,6 +2192,9 @@ impl RemoteTimelineClient {
upload_queue.clean.1 = Some(task.task_id);
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
self.metrics
.projected_remote_consistent_lsn_gauge
.set(lsn.0);
if self.generation.is_none() {
// Legacy mode: skip validating generation

View File

@@ -58,6 +58,11 @@ pub struct Stats {
pub sum_throttled_usecs: u64,
}
pub enum ThrottleResult {
NotThrottled { start: Instant },
Throttled { start: Instant, end: Instant },
}
impl<M> Throttle<M>
where
M: Metric,
@@ -122,15 +127,15 @@ where
self.inner.load().rate_limiter.steady_rps()
}
pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
let inner = self.inner.load_full(); // clones the `Inner` Arc
if !inner.enabled {
return None;
}
let start = std::time::Instant::now();
if !inner.enabled {
return ThrottleResult::NotThrottled { start };
}
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
let observation = Observation { wait_time };
self.metric.observe_throttling(&observation);
Some(wait_time)
ThrottleResult::Throttled { start, end: now }
} else {
None
ThrottleResult::NotThrottled { start }
}
}
}
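// Illustrative sketch, not from this PR (self-contained copy of the enum
// above): the waited duration the old `Option<Duration>` API returned can
// still be recovered from a ThrottleResult, while callers such as the
// SmgrOpTimer state machine also get the instants they need.
use std::time::{Duration, Instant};

enum ThrottleResult {
    NotThrottled { start: Instant },
    Throttled { start: Instant, end: Instant },
}

fn throttled_for(res: &ThrottleResult) -> Duration {
    match res {
        ThrottleResult::NotThrottled { .. } => Duration::ZERO,
        ThrottleResult::Throttled { start, end } => *end - *start,
    }
}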

View File

@@ -53,7 +53,7 @@ use utils::{
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
Background,
}
#[derive(enumset::EnumSetType)]
#[derive(Debug, enumset::EnumSetType)]
pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
@@ -779,17 +779,91 @@ pub(crate) enum CompactFlags {
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRange {
pub(crate) struct CompactRequest {
pub compact_key_range: Option<CompactKeyRange>,
pub compact_lsn_range: Option<CompactLsnRange>,
/// Whether the compaction job should be scheduled.
#[serde(default)]
pub scheduled: bool,
/// Whether the compaction job should be split across key ranges.
#[serde(default)]
pub sub_compaction: bool,
/// Max job size for each subcompaction job.
pub sub_compaction_max_job_size_mb: Option<u64>,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactLsnRange {
pub start: Lsn,
pub end: Lsn,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactKeyRange {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub start: Key,
#[serde_as(as = "serde_with::DisplayFromStr")]
pub end: Key,
}
#[derive(Clone, Default)]
impl From<Range<Lsn>> for CompactLsnRange {
fn from(range: Range<Lsn>) -> Self {
Self {
start: range.start,
end: range.end,
}
}
}
impl From<Range<Key>> for CompactKeyRange {
fn from(range: Range<Key>) -> Self {
Self {
start: range.start,
end: range.end,
}
}
}
impl From<CompactLsnRange> for Range<Lsn> {
fn from(range: CompactLsnRange) -> Self {
range.start..range.end
}
}
impl From<CompactKeyRange> for Range<Key> {
fn from(range: CompactKeyRange) -> Self {
range.start..range.end
}
}
impl CompactLsnRange {
#[cfg(test)]
#[cfg(feature = "testing")]
pub fn above(lsn: Lsn) -> Self {
Self {
start: lsn,
end: Lsn::MAX,
}
}
}
#[derive(Debug, Clone, Default)]
pub(crate) struct CompactOptions {
pub flags: EnumSet<CompactFlags>,
pub compact_range: Option<CompactRange>,
/// If set, the compaction will only compact the key range specified by this option.
/// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
pub compact_key_range: Option<CompactKeyRange>,
/// If set, the compaction will only compact data within this LSN range.
/// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
pub compact_lsn_range: Option<CompactLsnRange>,
/// Enable sub-compaction (split compaction job across key ranges).
/// This option is only used by GC compaction.
pub sub_compaction: bool,
/// Set job size for the GC compaction.
/// This option is only used by GC compaction.
pub sub_compaction_max_job_size_mb: Option<u64>,
}
impl std::fmt::Debug for Timeline {
@@ -1433,23 +1507,31 @@ impl Timeline {
Ok(lease)
}
/// Flush to disk all data that was written with the put_* functions
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
self.freeze0().await
}
/// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
self.freeze_and_flush0().await
}
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
let token = {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await?
};
let token = self.freeze0().await?;
self.wait_flush_completion(token).await
}
@@ -1603,7 +1685,10 @@ impl Timeline {
cancel,
CompactOptions {
flags,
compact_range: None,
compact_key_range: None,
compact_lsn_range: None,
sub_compaction: false,
sub_compaction_max_job_size_mb: None,
},
ctx,
)
@@ -2359,7 +2444,7 @@ impl Timeline {
result
.metrics
.last_record_gauge
.last_record_lsn_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
@@ -3481,7 +3566,7 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}
@@ -3849,6 +3934,10 @@ impl Timeline {
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
self.metrics
.disk_consistent_lsn_gauge
.set(new_value.0 as i64);
new_value != old_value
}
@@ -5887,6 +5976,23 @@ impl<'a> TimelineWriter<'a> {
return Ok(());
}
// In debug builds, assert that we don't write any keys that don't belong to this shard.
// We don't assert this in release builds, since key ownership policies may change over
// time. Stray keys will be removed during compaction.
if cfg!(debug_assertions) {
for metadata in &batch.metadata {
if let ValueMeta::Serialized(metadata) = metadata {
let key = Key::from_compact(metadata.key);
assert!(
self.shard_identity.is_key_local(&key)
|| self.shard_identity.is_key_global(&key),
"key {key} does not belong on shard {}",
self.shard_identity.shard_index()
);
}
}
}
let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;

View File

@@ -16,7 +16,6 @@ use super::{
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
@@ -30,7 +29,6 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -43,7 +41,7 @@ use crate::tenant::storage_layer::{
use crate::tenant::timeline::ImageLayerCreationOutcome;
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::{DeltaLayer, MaybeOffloaded};
use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use pageserver_api::config::tenant_conf_defaults::{
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -64,16 +62,69 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// A scheduled compaction task.
pub(crate) struct ScheduledCompactionTask {
/// It's unfortunate that we need to store a compact options struct here, because the only outer
/// API we can call is `compact_with_options`, which does a few setup calls before starting the
/// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
pub options: CompactOptions,
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
/// Holds the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
pub gc_block: Option<gc_block::Guard>,
}
/// A job description for the gc-compaction job. This structure describes the rectangular range that the job will
/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets
/// called.
#[derive(Debug, Clone)]
pub(crate) struct GcCompactJob {
pub dry_run: bool,
/// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range
/// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary.
pub compact_key_range: Range<Key>,
/// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be
/// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range
/// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`].
/// min_lsn will always be <= the lower bound specified here, and max_lsn will always be >= the upper bound specified here.
pub compact_lsn_range: Range<Lsn>,
}
impl GcCompactJob {
pub fn from_compact_options(options: CompactOptions) -> Self {
GcCompactJob {
dry_run: options.flags.contains(CompactFlags::DryRun),
compact_key_range: options
.compact_key_range
.map(|x| x.into())
.unwrap_or(Key::MIN..Key::MAX),
compact_lsn_range: options
.compact_lsn_range
.map(|x| x.into())
.unwrap_or(Lsn::INVALID..Lsn::MAX),
}
}
}
/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called
/// and contains the exact layers we want to compact.
pub struct GcCompactionJobDescription {
/// All layers to read in the compaction job
selected_layers: Vec<Layer>,
/// GC cutoff of the job
/// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to
/// keep all deltas <= this LSN or generate an image == this LSN.
gc_cutoff: Lsn,
/// LSNs to retain for the job
/// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or
/// generate an image == this LSN.
retain_lsns_below_horizon: Vec<Lsn>,
/// Maximum layer LSN processed in this compaction
/// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data
/// \>= this LSN will be kept and will not be rewritten.
max_layer_lsn: Lsn,
/// Only compact layers overlapping with this range
/// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive.
/// All accesses strictly below (`<`) this LSN will be routed through the normal read path instead of
/// k-merge within gc-compaction.
min_layer_lsn: Lsn,
/// Only compact layers overlapping with this range.
compaction_key_range: Range<Key>,
/// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
/// This field is here solely for debugging. The field will not be read once the compaction
@@ -292,7 +343,7 @@ impl Timeline {
)));
}
if options.compact_range.is_some() {
if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() {
// maybe useful in the future? could implement this at some point
return Err(CompactionError::Other(anyhow!(
"compaction range is not supported for legacy compaction for now"
@@ -1174,11 +1225,12 @@ impl Timeline {
.await
.map_err(CompactionError::Other)?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
if cfg!(debug_assertions) {
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
debug!("dropping key {key} during compaction (it belongs on shard {owner})");
}
if !new_layers.is_empty() {
@@ -1746,22 +1798,112 @@ impl Timeline {
Ok(())
}
pub(crate) async fn compact_with_gc(
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
/// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
/// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the job set act
/// like a full compaction of the specified keyspace.
pub(crate) async fn gc_compaction_split_jobs(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.partial_compact_with_gc(
options
.compact_range
.map(|range| range.start..range.end)
.unwrap_or_else(|| Key::MIN..Key::MAX),
cancel,
options.flags,
ctx,
)
.await
job: GcCompactJob,
sub_compaction_max_job_size_mb: Option<u64>,
) -> anyhow::Result<Vec<GcCompactJob>> {
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
job.compact_lsn_range.end
} else {
*self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
};
// Split the compaction job into sub-jobs of about 4GB each
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
let sub_compaction_max_job_size_mb =
sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
let mut compact_jobs = Vec::new();
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
// by estimating the number of files read for a compaction job. We should also partition on LSN.
let Ok(partition) = self.partitioning.try_lock() else {
bail!("failed to acquire partition lock");
};
let ((dense_ks, sparse_ks), _) = &*partition;
// Truncate the key range to be within the user-specified compaction range.
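// For intuition, this is interval intersection on half-open ranges:
// intersecting a partition covering keys [10, 30) with a user-specified
// range [20, 40) yields [20, 30), while disjoint ranges yield None
// (integer stand-ins for Key, for brevity).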
fn truncate_to(
source_start: &Key,
source_end: &Key,
target_start: &Key,
target_end: &Key,
) -> Option<(Key, Key)> {
let start = source_start.max(target_start);
let end = source_end.min(target_end);
if start < end {
Some((*start, *end))
} else {
None
}
}
let mut split_key_ranges = Vec::new();
let ranges = dense_ks
.parts
.iter()
.map(|partition| partition.ranges.iter())
.chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
.flatten()
.cloned()
.collect_vec();
for range in ranges.iter() {
let Some((start, end)) = truncate_to(
&range.start,
&range.end,
&job.compact_key_range.start,
&job.compact_key_range.end,
) else {
continue;
};
split_key_ranges.push((start, end));
}
split_key_ranges.sort();
let guard = self.layers.read().await;
let layer_map = guard.layer_map()?;
let mut current_start = None;
let ranges_num = split_key_ranges.len();
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
if current_start.is_none() {
current_start = Some(start);
}
let start = current_start.unwrap();
if start >= end {
// We have already processed this partition.
continue;
}
let res = layer_map.range_search(start..end, compact_below_lsn);
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
// Try to extend the compaction range so that we include at least one full layer file.
let extended_end = res
.found
.keys()
.map(|layer| layer.layer.key_range.end)
.min();
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
// In this case, we simply use the specified key range end.
let end = if let Some(extended_end) = extended_end {
extended_end.max(end)
} else {
end
};
info!(
"splitting compaction job: {}..{}, estimated_size={}",
start, end, total_size
);
compact_jobs.push(GcCompactJob {
dry_run: job.dry_run,
compact_key_range: start..end,
compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
});
current_start = Some(end);
}
}
drop(guard);
Ok(compact_jobs)
}
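// Illustrative sketch with integer stand-ins for Key and a hypothetical size
// function, not from this PR: the greedy core of the split above. Partitions
// accumulate until the estimated size exceeds the cap or the input is
// exhausted, then one job is emitted; the real code additionally extends each
// job's end so it covers at least one full layer file.
fn split_ranges(
    ranges: &[(u64, u64)],
    size_of: impl Fn(u64, u64) -> u64,
    cap: u64,
) -> Vec<(u64, u64)> {
    let mut jobs = Vec::new();
    let mut current_start: Option<u64> = None;
    for (idx, &(start, end)) in ranges.iter().enumerate() {
        let job_start = *current_start.get_or_insert(start);
        if job_start >= end {
            continue; // this partition was swallowed by a previous job
        }
        if size_of(job_start, end) > cap || idx + 1 == ranges.len() {
            jobs.push((job_start, end));
            current_start = Some(end);
        }
    }
    jobs
}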
/// An experimental compaction building block that combines compaction with garbage collection.
@@ -1771,17 +1913,49 @@ impl Timeline {
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
///
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
/// part of the range.
pub(crate) async fn partial_compact_with_gc(
///
/// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersecting with
/// the LSN. Otherwise, it will use the gc cutoff by default.
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
compaction_key_range: Range<Key>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let sub_compaction = options.sub_compaction;
let job = GcCompactJob::from_compact_options(options.clone());
if sub_compaction {
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs = self
.gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb)
.await?;
let jobs_len = jobs.len();
for (idx, job) in jobs.into_iter().enumerate() {
info!(
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
idx + 1,
jobs_len
);
self.compact_with_gc_inner(cancel, job, ctx).await?;
}
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(());
}
self.compact_with_gc_inner(cancel, job, ctx).await
}
async fn compact_with_gc_inner(
self: &Arc<Self>,
cancel: &CancellationToken,
job: GcCompactJob,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Block other compaction/GC tasks from running for now. GC-compaction could run along
@@ -1803,13 +1977,11 @@ impl Timeline {
)
.await?;
let dry_run = flags.contains(CompactFlags::DryRun);
let dry_run = job.dry_run;
let compact_key_range = job.compact_key_range;
let compact_lsn_range = job.compact_lsn_range;
if compaction_key_range == (Key::MIN..Key::MAX) {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
} else {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
}
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
scopeguard::defer! {
info!("done enhanced gc bottom-most compaction");
@@ -1826,7 +1998,26 @@ impl Timeline {
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min();
let gc_cutoff = {
// Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
// Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
// cleaning everything that it theoretically could. In the future, it should use `self.gc_info`
// as the source of truth.
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
// each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
// the real cutoff.
let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX {
real_gc_cutoff
} else {
compact_lsn_range.end
};
if gc_cutoff > real_gc_cutoff {
warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
gc_cutoff = real_gc_cutoff;
}
gc_cutoff
};
for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
if lsn < &gc_cutoff {
retain_lsns_below_horizon.push(*lsn);
@@ -1839,14 +2030,32 @@ impl Timeline {
}
let mut selected_layers: Vec<Layer> = Vec::new();
drop(gc_info);
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
// Firstly, pick all the layers that intersect with or are below the gc_cutoff, and get the largest LSN among the selected layers.
let Some(max_layer_lsn) = layers
.iter_historic_layers()
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
.map(|desc| desc.get_lsn_range().end)
.max()
else {
info!("no layers to compact with gc");
info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
return Ok(());
};
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
// the min_layer_lsn computed below will be filtered out, and their data will be accessed using the normal read path, as if
// it were a branch.
let Some(min_layer_lsn) = layers
.iter_historic_layers()
.filter(|desc| {
if compact_lsn_range.start == Lsn::INVALID {
true // select all layers below if start == Lsn(0)
} else {
desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn
}
})
.map(|desc| desc.get_lsn_range().start)
.min()
else {
info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end);
return Ok(());
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -1854,22 +2063,22 @@ impl Timeline {
let mut rewrite_layers = Vec::new();
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= max_layer_lsn
&& overlaps_with(&desc.get_key_range(), &compaction_key_range)
&& desc.get_lsn_range().start >= min_layer_lsn
&& overlaps_with(&desc.get_key_range(), &compact_key_range)
{
// If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
// even if it might contain extra keys
selected_layers.push(guard.get_from_desc(&desc));
// If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
// to overlap image layers)
if desc.is_delta()
&& !fully_contains(&compaction_key_range, &desc.get_key_range())
if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range())
{
rewrite_layers.push(desc);
}
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end);
return Ok(());
}
retain_lsns_below_horizon.sort();
@@ -1877,13 +2086,20 @@ impl Timeline {
selected_layers,
gc_cutoff,
retain_lsns_below_horizon,
min_layer_lsn,
max_layer_lsn,
compaction_key_range,
compaction_key_range: compact_key_range,
rewrite_layers,
}
};
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
Lsn(self.ancestor_lsn.0 + 1)
let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID {
// If we only compact above some LSN, we should get the history from the current branch below the specified LSN.
// We use job_desc.min_layer_lsn as if it's the lowest branch point.
(true, job_desc.min_layer_lsn)
} else if self.ancestor_timeline.is_some() {
// In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the
// LSN ranges all the way to the ancestor timeline.
(true, self.ancestor_lsn)
} else {
let res = job_desc
.retain_lsns_below_horizon
@@ -1901,17 +2117,19 @@ impl Timeline {
.unwrap_or(job_desc.gc_cutoff)
);
}
res
(false, res)
};
info!(
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
job_desc.selected_layers.len(),
job_desc.rewrite_layers.len(),
job_desc.max_layer_lsn,
job_desc.min_layer_lsn,
job_desc.gc_cutoff,
lowest_retain_lsn,
job_desc.compaction_key_range.start,
job_desc.compaction_key_range.end
job_desc.compaction_key_range.end,
has_data_below,
);
for layer in &job_desc.selected_layers {
@@ -1936,14 +2154,15 @@ impl Timeline {
// Step 1: construct a k-merge iterator over all layers.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let layer_names = job_desc
.selected_layers
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
}
// disable the check for now because we need to adjust the check for partial compactions, will enable later.
// let layer_names = job_desc
// .selected_layers
// .iter()
// .map(|layer| layer.layer_desc().layer_name())
// .collect_vec();
// if let Some(err) = check_valid_layermap(&layer_names) {
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
// }
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
.selected_layers
@@ -1954,10 +2173,22 @@ impl Timeline {
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
let mut total_downloaded_size = 0;
let mut total_layer_size = 0;
for layer in &job_desc.selected_layers {
if layer.needs_download().await?.is_some() {
total_downloaded_size += layer.layer_desc().file_size;
}
total_layer_size += layer.layer_desc().file_size;
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
}
info!(
"finish downloading layers, downloaded={}, total={}, ratio={:.2}",
total_downloaded_size,
total_layer_size,
total_downloaded_size as f64 / total_layer_size as f64
);
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
@@ -1980,7 +2211,7 @@ impl Timeline {
// Only create image layers when there is no ancestor branch. TODO: create a covering image layer
// when some conditions are met.
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
let mut image_layer_writer = if !has_data_below {
Some(
SplitImageLayerWriter::new(
self.conf,
@@ -2013,7 +2244,11 @@ impl Timeline {
}
let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
/// When we are not compacting the bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`).
/// The two cases are compaction on ancestor branches and when `compact_lsn_range.start` is set.
/// In those cases, we need to pull up data from below the LSN range we're compacting.
///
/// This function unifies the cases so that later code doesn't have to think about it.
///
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
/// is needed for reconstruction. This should be fixed in the future.
@@ -2021,17 +2256,19 @@ impl Timeline {
/// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
/// images.
async fn get_ancestor_image(
tline: &Arc<Timeline>,
this_tline: &Arc<Timeline>,
key: Key,
ctx: &RequestContext,
has_data_below: bool,
history_lsn_point: Lsn,
) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
if tline.ancestor_timeline.is_none() {
if !has_data_below {
return Ok(None);
};
// This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
// as much existing code as possible.
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
Ok(Some((key, tline.ancestor_lsn, img)))
let img = this_tline.get(key, history_lsn_point, ctx).await?;
Ok(Some((key, history_lsn_point, img)))
}
// Actually, we can decide not to write to the image layer at all at this point because
@@ -2048,6 +2285,11 @@ impl Timeline {
// This is not handled in the filter iterator because shard is determined by hash.
// Therefore, it does not give us any performance benefit to do things like skip
// a whole layer file as handling key spaces (ranges).
if cfg!(debug_assertions) {
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
continue;
}
if !job_desc.compaction_key_range.contains(&key) {
@@ -2110,7 +2352,8 @@ impl Timeline {
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx).await?,
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
.await?,
)
.await?;
retention
@@ -2139,7 +2382,7 @@ impl Timeline {
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx).await?,
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
)
.await?;
retention
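Because the rendered diff interleaves removed and added lines, the resulting control flow is easier to see in isolation. Below is a minimal, self-contained Rust sketch of the new three-way decision; `Lsn`, `GcCompactJob`, and the field names are simplified stand-ins, not the real pageserver types:

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Lsn(u64);

impl Lsn {
    const INVALID: Lsn = Lsn(0);
}

/// Simplified stand-in for the compaction job descriptor in the diff.
struct GcCompactJob {
    /// `compact_lsn_range.start` in the diff; INVALID means "compact from the bottom".
    compact_lsn_range_start: Lsn,
    min_layer_lsn: Lsn,
    /// Some(..) iff the timeline has an ancestor branch.
    ancestor_lsn: Option<Lsn>,
    /// What the `retain_lsns_below_horizon` computation would yield.
    lowest_retain_candidate: Lsn,
}

/// The three-way decision: partial compaction above an LSN, compaction on a
/// child branch, or compaction of a root branch from the bottom.
fn pick_lowest_retain_lsn(job: &GcCompactJob) -> (bool, Lsn) {
    if job.compact_lsn_range_start != Lsn::INVALID {
        // Only compacting above some LSN: everything below it is treated
        // as if it lived on an ancestor branch, so "we have data below".
        (true, job.min_layer_lsn)
    } else if let Some(ancestor_lsn) = job.ancestor_lsn {
        // Child branch: use the branch point so delta layers cover the LSN
        // range all the way down to the ancestor timeline.
        (true, ancestor_lsn)
    } else {
        // Root branch compacted from the bottom: no data below.
        (false, job.lowest_retain_candidate)
    }
}

fn main() {
    let job = GcCompactJob {
        compact_lsn_range_start: Lsn(0x40),
        min_layer_lsn: Lsn(0x40),
        ancestor_lsn: None,
        lowest_retain_candidate: Lsn(0x10),
    };
    assert_eq!(pick_lowest_retain_lsn(&job), (true, Lsn(0x40)));
}

The root-branch arm is the only one reporting `has_data_below = false`, which is what later gates image-layer creation in the hunks above.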


@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
if !records.is_empty() {
timeline
.metrics
.wal_records_received
.inc_by(records.len() as u64);
}
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
}
// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
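The first hunk above replaces per-record counter updates with one batched increment before the ingest loop. A small sketch of the same pattern with the prometheus crate; the counter name mirrors the diff, the rest is illustrative:

use prometheus::IntCounter;

fn ingest_batch(records: &[&str], wal_records_received: &IntCounter) {
    // One atomic add for the whole batch instead of one per record.
    if !records.is_empty() {
        wal_records_received.inc_by(records.len() as u64);
    }
    for record in records {
        // ... ingest the record; the counter is not touched again here ...
        let _ = record;
    }
}

fn main() {
    let counter =
        IntCounter::new("wal_records_received", "WAL records received").unwrap();
    ingest_batch(&["begin", "insert", "commit"], &counter);
    assert_eq!(counter.get(), 3);
}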


@@ -582,18 +582,21 @@ impl WalIngest {
forknum: FSM_FORKNUM,
};
// Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
// and instead of digging in the FSM bitmap format we just clear the whole page.
let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// Tail of last remaining FSM page has to be zeroed.
// We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
&& self
.shard
.is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
{
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
// Truncate this shard's view of the FSM relation size, if it even has one.
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
.await?;
}
@@ -617,7 +620,7 @@ impl WalIngest {
// tail bits in the last remaining map page, representing truncated heap
// blocks, need to be cleared. This is not only tidy, but also necessary
// because we don't get a chance to clear the bits if the heap is extended
// again.
// again. Only do this on the shard that owns the page.
if (trunc_byte != 0 || trunc_offs != 0)
&& self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
{
@@ -631,10 +634,9 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
// Truncate this shard's view of the VM relation size, if it even has one.
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
.await?;
}
@@ -875,22 +877,24 @@ impl WalIngest {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if modification.tline.get_shard_identity().is_shard_zero() {
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
});
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
});
if may_delete {
modification
.drop_slru_segment(SlruKind::Clog, segno, ctx)
.await?;
trace!("Drop CLOG segment {:>04X}", segno);
if may_delete {
modification
.drop_slru_segment(SlruKind::Clog, segno, ctx)
.await?;
trace!("Drop CLOG segment {:>04X}", segno);
}
}
}
@@ -1045,16 +1049,18 @@ impl WalIngest {
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
modification
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
.await?;
if modification.tline.get_shard_identity().is_shard_zero() {
while segment != endsegment {
modification
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
.await?;
/* move to next segment, handling wraparound correctly */
if segment == maxsegment {
segment = 0;
} else {
segment += 1;
/* move to next segment, handling wraparound correctly */
if segment == maxsegment {
segment = 0;
} else {
segment += 1;
}
}
}
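Both hunks in this file apply the same sharding guard: FSM/VM pages are only zeroed on the shard that owns the key, and SLRU (CLOG, MultiXact) segments are only dropped on shard zero. A condensed sketch of that ownership check; the types and the toy placement function are assumptions, not the real `ShardIdentity`:

#[derive(Clone, Copy, PartialEq, Eq)]
struct ShardNumber(u8);

/// Simplified stand-in for the pageserver's shard identity.
struct ShardIdentity {
    this_shard: ShardNumber,
    shard_count: u8,
}

impl ShardIdentity {
    /// Toy placement: stripe block numbers across shards.
    fn owner_of(&self, blkno: u32) -> ShardNumber {
        ShardNumber((blkno % self.shard_count as u32) as u8)
    }

    fn is_key_local(&self, blkno: u32) -> bool {
        self.owner_of(blkno) == self.this_shard
    }

    fn is_shard_zero(&self) -> bool {
        self.this_shard == ShardNumber(0)
    }
}

fn truncate_fsm(shard: &ShardIdentity, fsm_page: u32) {
    // Only the owning shard zeroes the tail page; every shard still
    // truncates its own view of the relation size afterwards.
    if shard.is_key_local(fsm_page) {
        println!("zeroing FSM page {fsm_page} on this shard");
    }
}

fn drop_clog_segments(shard: &ShardIdentity, segments: &[u32]) {
    // SLRU segments live only on shard zero; other shards skip the loop.
    if shard.is_shard_zero() {
        for seg in segments {
            println!("dropping CLOG segment {seg:04X}");
        }
    }
}

fn main() {
    let shard = ShardIdentity { this_shard: ShardNumber(0), shard_count: 4 };
    truncate_fsm(&shard, 8);
    drop_clog_segments(&shard, &[1, 2]);
}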


@@ -22,6 +22,7 @@
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/interrupt.h"
#include "storage/buf_internals.h"
#include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
*/
PSConnectionState state;
PGconn *conn;
/* request / response counters for debugging */
uint64 nrequests_sent;
uint64 nresponses_received;
/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
}
shard->state = PS_Connected;
shard->nrequests_sent = 0;
shard->nresponses_received = 0;
}
/* FALLTHROUGH */
case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
int ret;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
instr_time now,
start_ts,
since_start,
last_log_ts,
since_last_log;
bool logged = false;
/*
* As a debugging aid, if we don't get a response for a long time, print a
* log message.
*
* 10 s is a very generous threshold, normally we expect a response in a
* few milliseconds. We have metrics to track latencies in normal ranges,
* but in the cases that take exceptionally long, it's useful to log the
* exact timestamps.
*/
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)
INSTR_TIME_SET_CURRENT(now);
start_ts = last_log_ts = now;
INSTR_TIME_SET_ZERO(since_last_log);
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
if (ret == 0)
{
WaitEvent event;
long timeout;
timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
WAIT_EVENT_NEON_PS_READ);
ResetLatch(MyLatch);
@@ -684,9 +716,40 @@ retry:
}
}
/*
* Print a message to the log if a long time has passed with no
* response.
*/
INSTR_TIME_SET_CURRENT(now);
since_last_log = now;
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
{
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
last_log_ts = now;
logged = true;
}
goto retry;
}
/*
* If we logged earlier that the response is taking a long time, log
* another message when the response is finally received.
*/
if (logged)
{
INSTR_TIME_SET_CURRENT(now);
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
INSTR_TIME_GET_DOUBLE(since_start));
}
return ret;
}
@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
* point, but on the grand scheme of things it's only a small issue.
*/
shard->nrequests_sent++;
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
}
shard->nresponses_received++;
return (NeonResponse *) resp;
}
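The C change bounds each wait so the retry loop can periodically log how long it has been waiting, then logs once more when the response finally arrives. Here is the same shape as a Rust sketch; the 10 s threshold comes from the diff, everything else (names, polling granularity) is illustrative:

use std::time::{Duration, Instant};

const LOG_INTERVAL: Duration = Duration::from_secs(10);

/// Poll `try_recv` until it yields a value, logging every LOG_INTERVAL while
/// waiting, and once more on completion if we ever logged.
fn recv_with_slow_response_logging<T>(mut try_recv: impl FnMut() -> Option<T>) -> T {
    let start = Instant::now();
    let mut last_log = start;
    let mut logged = false;
    loop {
        if let Some(value) = try_recv() {
            if logged {
                println!(
                    "received response after {:.3} s",
                    start.elapsed().as_secs_f64()
                );
            }
            return value;
        }
        // Sleep in short slices, never past the next log deadline (the C code
        // instead passes the remaining time as the WaitEventSetWait timeout).
        let until_next_log = LOG_INTERVAL.saturating_sub(last_log.elapsed());
        std::thread::sleep(until_next_log.min(Duration::from_millis(10)));
        if last_log.elapsed() >= LOG_INTERVAL {
            println!(
                "no response for {:.3} s, still waiting",
                start.elapsed().as_secs_f64()
            );
            last_log = Instant::now();
            logged = true;
        }
    }
}

fn main() {
    let mut polls = 0;
    let value = recv_with_slow_response_logging(|| {
        polls += 1;
        (polls > 3).then_some(42)
    });
    assert_eq!(value, 42);
}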


@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
* ensuring we have received all but the last n requests (n = newsize).
*/
if (MyPState->n_requests_inflight > newsize)
prefetch_wait_for(MyPState->ring_unused - newsize);
{
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
Assert(MyPState->n_requests_inflight <= newsize);
}
/* construct the new PrefetchState, and copy over the memory contexts */
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
newPState->ring_last = newsize;
newPState->ring_unused = newsize;
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
newPState->max_shard_no = MyPState->max_shard_no;
memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
}
newPState->n_unused -= 1;
}
newPState->ring_flush = newPState->ring_receive;
MyNeonCounters->getpage_prefetches_buffered =
MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
{
PrefetchRequest *slot = GetPrfSlot(end);
Assert(slot->status != PRFS_REQUESTED);
if (slot->status == PRFS_RECEIVED)
{
pfree(slot->response);
@@ -610,6 +615,9 @@ prefetch_read(PrefetchRequest *slot)
{
NeonResponse *response;
MemoryContext old;
BufferTag buftag;
shardno_t shard_no;
uint64 my_ring_index;
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->response == NULL);
@@ -623,11 +631,29 @@ prefetch_read(PrefetchRequest *slot)
slot->status, slot->response,
(long)slot->my_ring_index, (long)MyPState->ring_receive);
/*
* Copy the request info so that if an error happens and the prefetch
* queue is flushed during the receive call, we can print the original
* values in the error message
*/
buftag = slot->buftag;
shard_no = slot->shard_no;
my_ring_index = slot->my_ring_index;
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(slot->shard_no);
response = (NeonResponse *) page_server->receive(shard_no);
MemoryContextSwitchTo(old);
if (response)
{
/* The slot should still be valid */
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
neon_shard_log(shard_no, ERROR,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
@@ -642,11 +668,15 @@ prefetch_read(PrefetchRequest *slot)
}
else
{
neon_shard_log(slot->shard_no, LOG,
/*
* Note: The slot might no longer be valid, if the connection was lost
* and the prefetch queue was flushed during the receive call
*/
neon_shard_log(shard_no, LOG,
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
(long)slot->my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
slot->buftag.forkNum, slot->buftag.blockNum);
(long) my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
buftag.forkNum, buftag.blockNum);
return false;
}
}
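The prefetch fix copies the slot's identifying fields before calling `receive`, because an error inside that call can flush the prefetch queue and invalidate the slot. A minimal sketch of the defensive-copy pattern, with simplified stand-in types:

#[derive(Clone, Copy, Debug)]
struct BufferTag {
    block: u32,
}

struct PrefetchSlot {
    buftag: BufferTag,
    shard_no: u16,
    ring_index: u64,
}

fn prefetch_read(slot: &PrefetchSlot, receive: impl FnOnce(u16) -> Option<Vec<u8>>) -> bool {
    // Copy everything we may want to log *before* the call that can
    // invalidate `slot` by flushing the prefetch queue on error.
    let buftag = slot.buftag;
    let shard_no = slot.shard_no;
    let ring_index = slot.ring_index;

    match receive(shard_no) {
        Some(_response) => true,
        None => {
            // Only the copies are safe to use here: the slot itself may have
            // been reset by a concurrent disconnect.
            eprintln!(
                "no response for prefetch entry {ring_index}, block {}; possible disconnect",
                buftag.block
            );
            false
        }
    }
}

fn main() {
    let slot = PrefetchSlot {
        buftag: BufferTag { block: 7 },
        shard_no: 0,
        ring_index: 42,
    };
    assert!(!prefetch_read(&slot, |_| None));
}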


@@ -74,10 +74,6 @@ impl std::fmt::Display for Backend<'_, ()> {
.debug_tuple("ControlPlane::ProxyV1")
.field(&endpoint.url())
.finish(),
ControlPlaneClient::Neon(endpoint) => fmt
.debug_tuple("ControlPlane::Neon")
.field(&endpoint.url())
.finish(),
#[cfg(any(test, feature = "testing"))]
ControlPlaneClient::PostgresMock(endpoint) => fmt
.debug_tuple("ControlPlane::PostgresMock")


@@ -43,9 +43,6 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("cplane-v1"), alias("control-plane"))]
ControlPlaneV1,
@@ -488,40 +485,7 @@ async fn main() -> anyhow::Result<()> {
}
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
let cache = api.caches.project_info.clone();
if let Some(client) = client1 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
if let Some(client) = client2 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
}
if let Some(regional_redis_client) = regional_redis_client {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
let span = tracing::info_span!("endpoints_cache");
maintenance_tasks.spawn(
async move { cache.do_read(con, cancellation_token.clone()).await }
.instrument(span),
);
}
} else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
@@ -757,65 +721,6 @@ fn build_auth_backend(
Ok(Either::Left(config))
}
AuthBackendType::ControlPlane => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
let endpoint_cache_config: config::EndpointCacheConfig =
args.endpoint_cache_config.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
);
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
wake_compute_cache_config,
project_info_cache_config,
endpoint_cache_config,
)));
let config::ConcurrencyLockOptions {
shards,
limiter,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
"wake_compute_lock",
limiter,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)?));
tokio::spawn(locks.garbage_collect_worker());
let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
let api = control_plane::client::neon::NeonControlPlaneClient::new(
endpoint,
args.control_plane_token.clone(),
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let api = control_plane::client::ControlPlaneClient::Neon(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend));
Ok(Either::Left(config))
}
#[cfg(feature = "testing")]
AuthBackendType::Postgres => {
let url = args.auth_endpoint.parse()?;


@@ -115,7 +115,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
};
if !self.limiter.lock().unwrap().check(subnet_key, 1) {
tracing::debug!("Rate limit exceeded. Skipping cancellation message");
// log only the subnet part of the IP address to know which subnet is rate limited
tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
Metrics::get()
.proxy
.cancellation_requests_total
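The warning now names the subnet the limiter keyed on. For reference, a sketch of deriving such a key with the ipnet crate; the diff only shows the IPv6 /64 arm, so the IPv4 /24 width here is an assumption:

use std::net::IpAddr;

use ipnet::{IpNet, Ipv4Net, Ipv6Net};

/// Rate-limit cancellations per network rather than per host. The IPv6 /64
/// matches the diff; the IPv4 /24 is an assumption for illustration.
fn subnet_key(peer: IpAddr) -> IpNet {
    match peer {
        IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new(ip, 24).unwrap().trunc()),
        IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new(ip, 64).unwrap().trunc()),
    }
}

fn main() {
    let key = subnet_key("2001:db8::1".parse().unwrap());
    // The whole /64 maps to one limiter bucket, so the warning names the subnet.
    println!("Rate limit exceeded. Skipping cancellation message, {key}");
}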


@@ -163,32 +163,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.await,
);
}
});
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
.await??
{
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
});
return Ok(None);
}
};
return Ok(None);
}
};
drop(pause);
ctx.set_db_options(params.clone());
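The cancellation task is wrapped in a detached root span (`parent: None`) that is linked back to the connection span with `follows_from` and attached to the future via `.instrument`. The same pattern recurs in the second `handle_client` variant and the Redis message handler below. A standalone sketch with the tracing crate; the `session_id` field mirrors the diff, the rest is illustrative:

use tracing::Instrument;

async fn cancel_session(session_id: u64) {
    tracing::info!(session_id, "cancelling session");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    let session_id = 42u64;
    // A detached root span: the spawned cancel task does not keep the
    // (short-lived) connection span open as its parent...
    let cancel_span = tracing::span!(
        parent: None,
        tracing::Level::INFO,
        "cancel_session",
        session_id
    );
    // ...but `follows_from` still records the causal link.
    cancel_span.follows_from(tracing::Span::current());

    tokio::spawn(cancel_session(session_id).instrument(cancel_span))
        .await
        .unwrap();
}

Detaching the span keeps the connection span from being held open by the spawned task, while `follows_from` preserves the causal link for tracing backends.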


@@ -1,7 +1,6 @@
pub mod cplane_proxy_v1;
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
use std::hash::Hash;
use std::sync::Arc;
@@ -28,10 +27,8 @@ use crate::types::EndpointId;
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneClient {
/// New Proxy V1 control plane API
/// Proxy V1 control plane API
ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
/// Current Management API (V2).
Neon(neon::NeonControlPlaneClient),
/// Local mock control plane.
#[cfg(any(test, feature = "testing"))]
PostgresMock(mock::MockControlPlane),
@@ -49,7 +46,6 @@ impl ControlPlaneApi for ControlPlaneClient {
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
@@ -66,7 +62,6 @@ impl ControlPlaneApi for ControlPlaneClient {
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
@@ -81,7 +76,6 @@ impl ControlPlaneApi for ControlPlaneClient {
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(test)]
@@ -96,7 +90,6 @@ impl ControlPlaneApi for ControlPlaneClient {
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
Self::Neon(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]


@@ -1,511 +0,0 @@
//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
use std::sync::Arc;
use std::time::Duration;
use ::http::header::AUTHORIZATION;
use ::http::HeaderName;
use futures::TryFutureExt;
use postgres_client::config::SslMode;
use tokio::time::Instant;
use tracing::{debug, info, info_span, warn, Instrument};
use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::cache::Cached;
use crate::context::RequestContext;
use crate::control_plane::caches::ApiCaches;
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::control_plane::{
AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
};
use crate::metrics::{CacheOutcome, Metrics};
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, http, scram};
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
#[derive(Clone)]
pub struct NeonControlPlaneClient {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
// put in a shared ref so we don't copy secrets all over in memory
jwt: Arc<str>,
}
impl NeonControlPlaneClient {
/// Construct an API object containing the auth parameters.
pub fn new(
endpoint: http::Endpoint,
jwt: Arc<str>,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
) -> Self {
Self {
endpoint,
caches,
locks,
wake_compute_endpoint_rate_limiter,
jwt,
}
}
pub(crate) fn url(&self) -> &str {
self.endpoint.url().as_str()
}
async fn do_get_auth_info(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
{
// TODO: refactor this because it's weird
// this is a failure to authenticate but we return Ok.
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let request = self
.endpoint
.get_path("proxy_get_role_secret")
.header(X_REQUEST_ID, &request_id)
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
("role", user_info.user.as_str()),
])
.build()?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = match parse_body::<GetRoleSecret>(response).await {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
// TODO(anna): retry
Err(e) => {
return if e.get_reason().is_not_found() {
// TODO: refactor this because it's weird
// this is a failure to authenticate but we return Ok.
Ok(AuthInfo::default())
} else {
Err(e.into())
};
}
};
let secret = if body.role_secret.is_empty() {
None
} else {
let secret = scram::ServerSecret::parse(&body.role_secret)
.map(AuthSecret::Scram)
.ok_or(GetAuthInfoError::BadSecret)?;
Some(secret)
};
let allowed_ips = body.allowed_ips.unwrap_or_default();
Metrics::get()
.proxy
.allowed_ips_number
.observe(allowed_ips.len() as f64);
Ok(AuthInfo {
secret,
allowed_ips,
project_id: body.project_id,
})
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_auth_info"))
.await
}
async fn do_get_endpoint_jwks(
&self,
ctx: &RequestContext,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &endpoint.normalize())
{
return Err(GetEndpointJwksError::EndpointNotFound);
}
let request_id = ctx.session_id().to_string();
async {
let request = self
.endpoint
.get_with_url(|url| {
url.path_segments_mut()
.push("endpoints")
.push(endpoint.as_str())
.push("jwks");
})
.header(X_REQUEST_ID, &request_id)
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.build()
.map_err(GetEndpointJwksError::RequestBuild)?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self
.endpoint
.execute(request)
.await
.map_err(GetEndpointJwksError::RequestExecute)?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = parse_body::<EndpointJwksResponse>(response).await?;
let rules = body
.jwks
.into_iter()
.map(|jwks| AuthRule {
id: jwks.id,
jwks_url: jwks.jwks_url,
audience: jwks.jwt_audience,
role_names: jwks.role_names,
})
.collect();
Ok(rules)
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_endpoint_jwks"))
.await
}
async fn do_wake_compute(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let mut request_builder = self
.endpoint
.get_path("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
]);
let options = user_info.options.to_deep_object();
if !options.is_empty() {
request_builder = request_builder.query(&options);
}
let request = request_builder.build()?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = parse_body::<WakeCompute>(response).await?;
// Unfortunately, ownership won't let us use `Option::ok_or` here.
let (host, port) = match parse_host_port(&body.address) {
None => return Err(WakeComputeError::BadComputeAddress(body.address)),
Some(x) => x,
};
// Don't set anything but host and port! This config will be cached.
// We'll set username and such later using the startup message.
// TODO: add more type safety (in progress).
let mut config = compute::ConnCfg::new(host.to_owned(), port);
config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
let node = NodeInfo {
config,
aux: body.aux,
allow_self_signed_compute: false,
};
Ok(node)
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_wake_compute"))
.await
}
}
impl super::ControlPlaneApi for NeonControlPlaneClient {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
let user = &user_info.user;
if let Some(role_secret) = self
.caches
.project_info
.get_role_secret(normalized_ep, user)
{
return Ok(role_secret);
}
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
project_id,
normalized_ep_int,
Arc::new(auth_info.allowed_ips),
);
ctx.set_project_id(project_id);
}
// When we just got a secret, we don't need to invalidate it.
Ok(Cached::new_uncached(auth_info.secret))
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Hit);
return Ok((allowed_ips, None));
}
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Miss);
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user;
if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
project_id,
normalized_ep_int,
allowed_ips.clone(),
);
ctx.set_project_id(project_id);
}
Ok((
Cached::new_uncached(allowed_ips),
Some(Cached::new_uncached(auth_info.secret)),
))
}
#[tracing::instrument(skip_all)]
async fn get_endpoint_jwks(
&self,
ctx: &RequestContext,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
self.do_get_endpoint_jwks(ctx, endpoint).await
}
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key();
macro_rules! check_cache {
() => {
if let Some(cached) = self.caches.node_info.get(&key) {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
})?;
debug!(key = &*key, "found cached compute node info");
ctx.set_project(info.aux.clone());
return Ok(cached.map(|()| info));
}
};
}
// Every time we do a wakeup http request, the compute node will stay up
// for some time (highly depends on the console's scale-to-zero policy);
// The connection info remains the same during that period of time,
// which means that we might cache it to reduce the load and latency.
check_cache!();
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check
if permit.should_check_cache() {
// TODO: if there is something in the cache, mark the permit as success.
check_cache!();
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
match node {
Ok(node) => {
ctx.set_project(node.aux.clone());
debug!(key = &*key, "created a cache entry for woken compute node");
let mut stored_node = node.clone();
// store the cached node as 'warm_cached'
stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
};
let reason = status
.details
.error_info
.map_or(Reason::Unknown, |x| x.reason);
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
}
// at this point, we should only have quota errors.
debug!(
key = &*key,
"created a cache entry for the wake compute error"
);
self.caches.node_info.insert_ttl(
key,
Err(err.clone()),
Duration::from_secs(30),
);
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)))
}
err => return Err(err),
},
}
}
}
/// Parse http response body, taking status code into account.
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
response: http::Response,
) -> Result<T, ControlPlaneError> {
let status = response.status();
if status.is_success() {
// We shouldn't log raw body because it may contain secrets.
info!("request succeeded, processing the body");
return Ok(response.json().await?);
}
let s = response.bytes().await?;
// Log the plaintext to be able to detect whether there are cases not covered by the error struct.
info!("response_error plaintext: {:?}", s);
// Don't throw an error here because it's not as important
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ControlPlaneErrorMessage {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
}
});
body.http_status_code = status;
warn!("console responded with an error ({status}): {body:?}");
Err(ControlPlaneError::Message(Box::new(body)))
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.rsplit_once(':')?;
let ipv6_brackets: &[_] = &['[', ']'];
Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_host_port_v4() {
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
assert_eq!(host, "127.0.0.1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_v6() {
let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
assert_eq!(host, "2001:db8::1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_url() {
let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
.expect("failed to parse");
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
assert_eq!(port, 5432);
}
}


@@ -221,15 +221,6 @@ pub(crate) struct UserFacingMessage {
pub(crate) message: Box<str>,
}
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
/// Returned by the `/proxy_get_role_secret` API method.
#[derive(Deserialize)]
pub(crate) struct GetRoleSecret {
pub(crate) role_secret: Box<str>,
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
/// Returned by the `/get_endpoint_access_control` API method.
#[derive(Deserialize)]
@@ -240,13 +231,6 @@ pub(crate) struct GetEndpointAccessControl {
pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
}
// Manually implement debug to omit sensitive info.
impl fmt::Debug for GetRoleSecret {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("GetRoleSecret").finish_non_exhaustive()
}
}
/// Response which holds compute node's `host:port` pair.
/// Returned by the `/proxy_wake_compute` API method.
#[derive(Debug, Deserialize)]
@@ -477,18 +461,18 @@ mod tests {
let json = json!({
"role_secret": "secret",
});
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
let json = json!({
"role_secret": "secret",
"allowed_ips": ["8.8.8.8"],
});
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
let json = json!({
"role_secret": "secret",
"allowed_ips": ["8.8.8.8"],
"project_id": "project",
});
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
Ok(())
}


@@ -272,32 +272,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.await,
);
}
});
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
.await??
{
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
});
return Ok(None);
}
};
return Ok(None);
}
};
drop(pause);
ctx.set_db_options(params.clone());


@@ -13,6 +13,7 @@ use crate::cache::project_info::ProjectInfoCache;
use crate::cancellation::{CancelMap, CancellationHandler};
use crate::intern::{ProjectIdInt, RoleNameInt};
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
use tracing::Instrument;
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
@@ -143,6 +144,8 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
let peer_addr = cancel_session
.peer_addr
.unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id);
cancel_span.follows_from(tracing::Span::current());
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
match self
.cancellation_handler
@@ -152,6 +155,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
peer_addr,
cancel_session.peer_addr.is_some(),
)
.instrument(cancel_span)
.await
{
Ok(()) => {}


@@ -83,14 +83,20 @@ impl Env {
node_id: NodeId,
ttid: TenantTimelineId,
) -> anyhow::Result<Arc<Timeline>> {
let conf = self.make_conf(node_id);
let conf = Arc::new(self.make_conf(node_id));
let timeline_dir = get_timeline_dir(&conf, &ttid);
let remote_path = remote_timeline_path(&ttid)?;
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
let timeline = Timeline::new(
ttid,
&timeline_dir,
&remote_path,
shared_state,
conf.clone(),
);
timeline.bootstrap(
&mut timeline.write_shared_state().await,
&conf,


@@ -338,7 +338,7 @@ async fn main() -> anyhow::Result<()> {
}
};
let conf = SafeKeeperConf {
let conf = Arc::new(SafeKeeperConf {
workdir,
my_id: id,
listen_pg_addr: args.listen_pg,
@@ -368,7 +368,7 @@ async fn main() -> anyhow::Result<()> {
control_file_save_interval: args.control_file_save_interval,
partial_backup_concurrency: args.partial_backup_concurrency,
eviction_min_resident: args.eviction_min_resident,
};
});
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(
@@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> {
/// complete, e.g. panicked, inner is error produced by task itself.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
// fsync the datadir to make sure we have a consistent state on disk.
if !conf.no_sync {
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
@@ -428,9 +428,11 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
e
})?;
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
// Register metrics collector for active timelines. It's important to do this
// after daemonizing, otherwise process collector will be upset.
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
metrics::register_internal(Box::new(timeline_collector))?;
wal_backup::init_remote_storage(&conf).await;
@@ -447,9 +449,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.then(|| Handle::try_current().expect("no runtime in main"));
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone()).await?;
global_timelines.init().await?;
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");
@@ -459,14 +460,16 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(
conf_,
conf.clone(),
pg_listener,
Scope::SafekeeperData,
global_timelines.clone(),
))
// wrap with task name for error reporting
.map(|res| ("WAL service main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
let global_timelines_ = global_timelines.clone();
let timeline_housekeeping_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
@@ -474,40 +477,45 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
loop {
tokio::time::sleep(TOMBSTONE_TTL).await;
GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
global_timelines_.housekeeping(&TOMBSTONE_TTL);
}
})
.map(|res| ("Timeline map housekeeping".to_owned(), res));
tasks_handles.push(Box::pin(timeline_housekeeping_handle));
if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
let conf_ = conf.clone();
let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(
conf_,
conf.clone(),
pg_listener_tenant_only,
Scope::Tenant,
global_timelines.clone(),
))
// wrap with task name for error reporting
.map(|res| ("WAL service tenant only main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
}
let conf_ = conf.clone();
let http_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| HTTP_RUNTIME.handle())
.spawn(http::task_main(conf_, http_listener))
.spawn(http::task_main(
conf.clone(),
http_listener,
global_timelines.clone(),
))
.map(|res| ("HTTP service main".to_owned(), res));
tasks_handles.push(Box::pin(http_handle));
let conf_ = conf.clone();
let broker_task_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| BROKER_RUNTIME.handle())
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
.spawn(
broker::task_main(conf.clone(), global_timelines.clone())
.instrument(info_span!("broker")),
)
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
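The safekeeper changes replace a by-value `SafeKeeperConf` and the former process-global `GlobalTimelines` with `Arc`s threaded into every task. A condensed sketch of that dependency-injection shape; the struct contents and task bodies are placeholders:

use std::sync::Arc;

struct SafeKeeperConf {
    listen_pg_addr: String,
}

struct GlobalTimelines; // previously accessed as a process-global, now an instance

impl GlobalTimelines {
    fn new(_conf: Arc<SafeKeeperConf>) -> Self {
        GlobalTimelines
    }
    fn housekeeping(&self) {
        println!("timeline map housekeeping");
    }
}

async fn wal_service(conf: Arc<SafeKeeperConf>, _timelines: Arc<GlobalTimelines>) {
    println!("WAL service listening on {}", conf.listen_pg_addr);
}

#[tokio::main]
async fn main() {
    let conf = Arc::new(SafeKeeperConf {
        listen_pg_addr: "0.0.0.0:5454".to_string(),
    });
    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));

    // Each task receives cheap Arc clones instead of a full config copy.
    let svc = tokio::spawn(wal_service(conf.clone(), global_timelines.clone()));
    let housekeeping = tokio::spawn(async move { global_timelines.housekeeping() });

    svc.await.unwrap();
    housekeeping.await.unwrap();
}

Cloning an `Arc` only bumps a reference count, so every spawned service shares one immutable config and one timeline map instead of cloning the config or reaching into a static.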


@@ -39,14 +39,17 @@ const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
async fn push_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
if conf.disable_periodic_broker_push {
info!("broker push_loop is disabled, doing nothing...");
futures::future::pending::<()>().await; // sleep forever
return Ok(());
}
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
let active_timelines_set = global_timelines.get_global_broker_active_set();
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
@@ -87,8 +90,13 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
/// Subscribe and fetch all the interesting data from the broker.
#[instrument(name = "broker_pull", skip_all)]
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
async fn pull_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
stats: Arc<BrokerStats>,
) -> Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
// TODO: subscribe only to local timelines instead of all
let request = SubscribeSafekeeperInfoRequest {
@@ -113,7 +121,7 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
.as_ref()
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
let ttid = parse_proto_ttid(proto_ttid)?;
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
// Note that we also receive *our own* info. That's
// important, as it is used as an indication of live
// connection to the broker.
@@ -135,7 +143,11 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
/// Process incoming discover requests. This is done in a separate task to avoid
/// interfering with the normal pull/push loops.
async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
async fn discover_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
stats: Arc<BrokerStats>,
) -> Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
@@ -171,7 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
.as_ref()
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
let ttid = parse_proto_ttid(proto_ttid)?;
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
// we received a discovery request for a timeline we know about
discover_counter.inc();
@@ -210,7 +222,10 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
bail!("end of stream");
}
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
pub async fn task_main(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
info!("started, broker endpoint {:?}", conf.broker_endpoint);
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
@@ -261,13 +276,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
},
_ = ticker.tick() => {
if push_handle.is_none() {
push_handle = Some(tokio::spawn(push_loop(conf.clone())));
push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone())));
}
if pull_handle.is_none() {
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone())));
}
if discover_handle.is_none() {
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone())));
}
},
_ = &mut stats_task => {}


@@ -1,9 +1,7 @@
use std::sync::Arc;
use anyhow::{bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
use std::sync::Arc;
use tokio::{
fs::OpenOptions,
io::{AsyncSeekExt, AsyncWriteExt},
@@ -14,7 +12,7 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
control_file::FileStorage,
state::TimelinePersistentState,
timeline::{Timeline, TimelineError, WalResidentTimeline},
timeline::{TimelineError, WalResidentTimeline},
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
@@ -25,16 +23,19 @@ use crate::{
const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
pub struct Request {
pub source: Arc<Timeline>,
pub source_ttid: TenantTimelineId,
pub until_lsn: Lsn,
pub destination_ttid: TenantTimelineId,
}
pub async fn handle_request(request: Request) -> Result<()> {
pub async fn handle_request(
request: Request,
global_timelines: Arc<GlobalTimelines>,
) -> Result<()> {
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
// if the LSN points to the middle of a WAL record, the timeline will be in a "broken" state
match GlobalTimelines::get(request.destination_ttid) {
match global_timelines.get(request.destination_ttid) {
// timeline already exists. would be good to check that this timeline is the copy
// of the source timeline, but it isn't obvious how to do that
Ok(_) => return Ok(()),
@@ -46,9 +47,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
}
let source_tli = request.source.wal_residence_guard().await?;
let source = global_timelines.get(request.source_ttid)?;
let source_tli = source.wal_residence_guard().await?;
let conf = &GlobalTimelines::get_global_config();
let conf = &global_timelines.get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
@@ -127,7 +129,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
copy_s3_segments(
wal_seg_size,
&request.source.ttid,
&request.source_ttid,
&request.destination_ttid,
first_segment,
first_ondisk_segment,
@@ -158,7 +160,9 @@ pub async fn handle_request(request: Request) -> Result<()> {
// now we have a ready timeline in a temp directory
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
global_timelines
.load_temp_timeline(request.destination_ttid, &tli_dir_path, true)
.await?;
Ok(())
}


@@ -207,23 +207,23 @@ pub struct FileInfo {
}
/// Build debug dump response, using the provided [`Args`] filters.
pub async fn build(args: Args) -> Result<Response> {
pub async fn build(args: Args, global_timelines: Arc<GlobalTimelines>) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
let config = GlobalTimelines::get_global_config();
let timelines_count = global_timelines.timelines_count();
let config = global_timelines.get_global_config();
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
// If both tenant_id and timeline_id are specified, we can just get the
// timeline directly, without taking a snapshot of the whole list.
let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
vec![tli]
} else {
vec![]
}
} else {
// Otherwise, take a snapshot of the whole list.
GlobalTimelines::get_all()
global_timelines.get_all()
};
let mut timelines = Vec::new();
@@ -344,12 +344,12 @@ fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
/// supposed to be exposed.
fn build_config(config: SafeKeeperConf) -> Config {
fn build_config(config: Arc<SafeKeeperConf>) -> Config {
Config {
id: config.my_id,
workdir: config.workdir.into(),
listen_pg_addr: config.listen_pg_addr,
listen_http_addr: config.listen_http_addr,
workdir: config.workdir.clone().into(),
listen_pg_addr: config.listen_pg_addr.clone(),
listen_http_addr: config.listen_http_addr.clone(),
no_sync: config.no_sync,
max_offloader_lag_bytes: config.max_offloader_lag_bytes,
wal_backup_enabled: config.wal_backup_enabled,


@@ -33,7 +33,7 @@ use utils::{
/// Safekeeper handler of postgres commands
pub struct SafekeeperPostgresHandler {
pub conf: SafeKeeperConf,
pub conf: Arc<SafeKeeperConf>,
/// assigned application name
pub appname: Option<String>,
pub tenant_id: Option<TenantId>,
@@ -43,6 +43,7 @@ pub struct SafekeeperPostgresHandler {
pub protocol: Option<PostgresClientProtocol>,
/// Unique connection id is logged in spans for observability.
pub conn_id: ConnectionId,
pub global_timelines: Arc<GlobalTimelines>,
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
auth: Option<(Scope, Arc<JwtAuth>)>,
claims: Option<Claims>,
@@ -314,10 +315,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
impl SafekeeperPostgresHandler {
pub fn new(
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
conn_id: u32,
io_metrics: Option<TrafficMetrics>,
auth: Option<(Scope, Arc<JwtAuth>)>,
global_timelines: Arc<GlobalTimelines>,
) -> Self {
SafekeeperPostgresHandler {
conf,
@@ -331,6 +333,7 @@ impl SafekeeperPostgresHandler {
claims: None,
auth,
io_metrics,
global_timelines,
}
}
@@ -360,7 +363,7 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
// Get timeline, handling "not found" error
let tli = match GlobalTimelines::get(self.ttid) {
let tli = match self.global_timelines.get(self.ttid) {
Ok(tli) => Ok(Some(tli)),
Err(TimelineError::NotFound(_)) => Ok(None),
Err(e) => Err(QueryError::Other(e.into())),
@@ -394,7 +397,10 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let tli = self
.global_timelines
.get(self.ttid)
.map_err(|e| QueryError::Other(e.into()))?;
let lsn = if self.is_walproposer_recovery() {
// walproposer should get all local WAL until flush_lsn


@@ -3,14 +3,16 @@ pub mod routes;
pub use routes::make_router;
pub use safekeeper_api::models;
use std::sync::Arc;
use crate::SafeKeeperConf;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
http_listener: std::net::TcpListener,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
let router = make_router(conf)
let router = make_router(conf, global_timelines)
.build()
.map_err(|err| anyhow::anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();

View File

@@ -66,6 +66,13 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
.as_ref()
}
+ fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
+     request
+         .data::<Arc<GlobalTimelines>>()
+         .expect("unknown state type")
+         .clone()
+ }
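The `.expect("unknown state type")` reads oddly until you see how routerify-style request state works: values registered on the router with `.data(...)` are stored one-per-concrete-type and fetched back by type, so the expect can only fire if `make_router` never registered an `Arc<GlobalTimelines>`. A toy model of that mechanism (illustrative, not the router's actual code):

```rust
use std::any::{Any, TypeId};
use std::collections::HashMap;

// One slot per concrete type, like the router's request-state map.
#[derive(Default)]
struct StateMap(HashMap<TypeId, Box<dyn Any + Send + Sync>>);

impl StateMap {
    fn data<T: Any + Send + Sync>(&mut self, value: T) {
        self.0.insert(TypeId::of::<T>(), Box::new(value));
    }
    fn get<T: Any + Send + Sync>(&self) -> Option<&T> {
        self.0.get(&TypeId::of::<T>())?.downcast_ref::<T>()
    }
}
```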
/// Same as TermLsn, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
@@ -123,9 +130,11 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
+ let global_timelines = get_global_timelines(&request);
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
- let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
+ let delete_info = global_timelines
+     .delete_force_all_for_tenant(&tenant_id, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(
@@ -156,7 +165,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.commit_lsn
.segment_lsn(server_info.wal_seg_size as usize)
});
- GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
+ let global_timelines = get_global_timelines(&request);
+ global_timelines
+     .create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
.await
.map_err(ApiError::InternalServerError)?;
@@ -167,7 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
/// Note: it is possible to do the same with debug_dump.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
- let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
+ let global_timelines = get_global_timelines(&request);
+ let res: Vec<TenantTimelineId> = global_timelines
+     .get_all()
.iter()
.map(|tli| tli.ttid)
.collect();
@@ -182,7 +195,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
);
check_permission(&request, Some(ttid.tenant_id))?;
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
@@ -233,9 +247,11 @@ async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
+ let global_timelines = get_global_timelines(&request);
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
- let resp = GlobalTimelines::delete(&ttid, only_local)
+ let resp = global_timelines
+     .delete(&ttid, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
@@ -247,8 +263,9 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
let data: pull_timeline::Request = json_request(&mut request).await?;
let conf = get_conf(&request);
- let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone())
+ let global_timelines = get_global_timelines(&request);
+ let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone(), global_timelines)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
@@ -263,7 +280,8 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
);
check_permission(&request, Some(ttid.tenant_id))?;
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
// so create the chan and write to it in another task.
@@ -293,19 +311,19 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TimelineCopyRequest = json_request(&mut request).await?;
- let ttid = TenantTimelineId::new(
+ let source_ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "source_timeline_id")?,
);
- let source = GlobalTimelines::get(ttid)?;
+ let global_timelines = get_global_timelines(&request);
copy_timeline::handle_request(copy_timeline::Request{
-     source,
+     source_ttid,
    until_lsn: request_data.until_lsn,
-     destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
+     destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
- })
+ }, global_timelines)
- .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
+ .instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
.await
.map_err(ApiError::InternalServerError)?;
@@ -322,7 +340,8 @@ async fn patch_control_file_handler(
parse_request_param(&request, "timeline_id")?,
);
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let patch_request: patch_control_file::Request = json_request(&mut request).await?;
let response = patch_control_file::handle_request(tli, patch_request)
@@ -341,7 +360,8 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
parse_request_param(&request, "timeline_id")?,
);
- let tli = GlobalTimelines::get(ttid)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid)?;
tli.write_shared_state()
.await
.sk
@@ -359,6 +379,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
);
check_permission(&request, Some(ttid.tenant_id))?;
+ let global_timelines = get_global_timelines(&request);
let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
@@ -371,7 +392,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
)))?,
};
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let tli = tli
.wal_residence_guard()
.await
@@ -393,7 +414,8 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
);
check_permission(&request, Some(ttid.tenant_id))?;
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let response = tli
.backup_partial_reset()
@@ -415,7 +437,8 @@ async fn timeline_term_bump_handler(
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let response = tli
.term_bump(request_data.term)
.await
@@ -452,7 +475,8 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
standby_horizon: sk_info.standby_horizon.0,
};
- let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+ let global_timelines = get_global_timelines(&request);
+ let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
tli.record_safekeeper_info(proto_sk_info)
.await
.map_err(ApiError::InternalServerError)?;
@@ -506,6 +530,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
let dump_term_history = dump_term_history.unwrap_or(true);
let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
+ let global_timelines = get_global_timelines(&request);
let args = debug_dump::Args {
dump_all,
dump_control_file,
@@ -517,7 +543,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
timeline_id,
};
- let resp = debug_dump::build(args)
+ let resp = debug_dump::build(args, global_timelines)
.await
.map_err(ApiError::InternalServerError)?;
@@ -570,7 +596,10 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
}
/// Safekeeper http router.
- pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
+ pub fn make_router(
+     conf: Arc<SafeKeeperConf>,
+     global_timelines: Arc<GlobalTimelines>,
+ ) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
if conf.http_auth.is_some() {
router = router.middleware(auth_middleware(|request| {
@@ -592,7 +621,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
let auth = conf.http_auth.clone();
router
- .data(Arc::new(conf))
+ .data(conf)
+ .data(global_timelines)
.data(auth)
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))

View File

@@ -11,7 +11,6 @@ use postgres_backend::QueryError;
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::*;
- use utils::id::TenantTimelineId;
use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -21,7 +20,6 @@ use crate::safekeeper::{
use crate::safekeeper::{Term, TermHistory, TermLsn};
use crate::state::TimelinePersistentState;
use crate::timeline::WalResidentTimeline;
- use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
use postgres_ffi::encode_logical_message;
use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -70,7 +68,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
info!("JSON_CTRL request: {append_request:?}");
// need to init safekeeper state before AppendRequest
- let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
+ let tli = prepare_safekeeper(spg, append_request.pg_version).await?;
// if send_proposer_elected is true, we need to update local history
if append_request.send_proposer_elected {
@@ -99,20 +97,22 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
/// Prepare safekeeper to process append requests without crashes,
/// by sending ProposerGreeting with default server.wal_seg_size.
async fn prepare_safekeeper(
- ttid: TenantTimelineId,
+ spg: &SafekeeperPostgresHandler,
pg_version: u32,
) -> anyhow::Result<WalResidentTimeline> {
- let tli = GlobalTimelines::create(
-     ttid,
-     ServerInfo {
-         pg_version,
-         wal_seg_size: WAL_SEGMENT_SIZE as u32,
-         system_id: 0,
-     },
-     Lsn::INVALID,
-     Lsn::INVALID,
- )
- .await?;
+ let tli = spg
+     .global_timelines
+     .create(
+         spg.ttid,
+         ServerInfo {
+             pg_version,
+             wal_seg_size: WAL_SEGMENT_SIZE as u32,
+             system_id: 0,
+         },
+         Lsn::INVALID,
+         Lsn::INVALID,
+     )
+     .await?;
tli.wal_residence_guard().await
}

View File

@@ -455,6 +455,7 @@ pub struct FullTimelineInfo {
/// Collects metrics for all active timelines.
pub struct TimelineCollector {
+ global_timelines: Arc<GlobalTimelines>,
descs: Vec<Desc>,
commit_lsn: GenericGaugeVec<AtomicU64>,
backup_lsn: GenericGaugeVec<AtomicU64>,
@@ -478,14 +479,8 @@ pub struct TimelineCollector {
active_timelines_count: IntGauge,
}
- impl Default for TimelineCollector {
-     fn default() -> Self {
-         Self::new()
-     }
- }
impl TimelineCollector {
-     pub fn new() -> TimelineCollector {
+     pub fn new(global_timelines: Arc<GlobalTimelines>) -> TimelineCollector {
let mut descs = Vec::new();
let commit_lsn = GenericGaugeVec::new(
@@ -676,6 +671,7 @@ impl TimelineCollector {
descs.extend(active_timelines_count.desc().into_iter().cloned());
TimelineCollector {
+ global_timelines,
descs,
commit_lsn,
backup_lsn,
@@ -728,17 +724,18 @@ impl Collector for TimelineCollector {
self.written_wal_seconds.reset();
self.flushed_wal_seconds.reset();
- let timelines_count = GlobalTimelines::get_all().len();
+ let timelines_count = self.global_timelines.get_all().len();
let mut active_timelines_count = 0;
// Prometheus Collector is sync, and data is stored under an async lock. To
// bridge the gap with a crutch, collect data in a spawned thread with a
// local tokio runtime.
+ let global_timelines = self.global_timelines.clone();
let infos = std::thread::spawn(|| {
let rt = tokio::runtime::Builder::new_current_thread()
.build()
.expect("failed to create rt");
- rt.block_on(collect_timeline_metrics())
+ rt.block_on(collect_timeline_metrics(global_timelines))
})
.join()
.expect("collect_timeline_metrics thread panicked");
@@ -857,9 +854,9 @@ impl Collector for TimelineCollector {
}
}
- async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
+ async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec<FullTimelineInfo> {
let mut res = vec![];
- let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
+ let active_timelines = global_timelines.get_global_broker_active_set().get_all();
for tli in active_timelines {
if let Some(info) = tli.info_for_metrics().await {

View File

@@ -409,8 +409,9 @@ pub struct DebugDumpResponse {
pub async fn handle_request(
request: Request,
sk_auth_token: Option<SecretString>,
+ global_timelines: Arc<GlobalTimelines>,
) -> Result<Response> {
- let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
+ let existing_tli = global_timelines.get(TenantTimelineId::new(
request.tenant_id,
request.timeline_id,
));
@@ -453,13 +454,14 @@ pub async fn handle_request(
assert!(status.tenant_id == request.tenant_id);
assert!(status.timeline_id == request.timeline_id);
- pull_timeline(status, safekeeper_host, sk_auth_token).await
+ pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await
}
async fn pull_timeline(
status: TimelineStatus,
host: String,
sk_auth_token: Option<SecretString>,
+ global_timelines: Arc<GlobalTimelines>,
) -> Result<Response> {
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
info!(
@@ -472,7 +474,7 @@ async fn pull_timeline(
status.acceptor_state.epoch
);
- let conf = &GlobalTimelines::get_global_config();
+ let conf = &global_timelines.get_global_config();
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
@@ -531,7 +533,9 @@ async fn pull_timeline(
assert!(status.commit_lsn <= status.flush_lsn);
// Finally, load the timeline.
- let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
+ let _tli = global_timelines
+     .load_temp_timeline(ttid, &tli_dir_path, false)
+     .await?;
Ok(Response {
safekeeper_host: host,

View File

@@ -267,6 +267,7 @@ impl SafekeeperPostgresHandler {
pgb_reader: &mut pgb_reader,
peer_addr,
acceptor_handle: &mut acceptor_handle,
+ global_timelines: self.global_timelines.clone(),
};
// Read first message and create timeline if needed.
@@ -331,6 +332,7 @@ struct NetworkReader<'a, IO> {
// WalAcceptor is spawned when we learn server info from walproposer and
// create timeline; handle is put here.
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
+ global_timelines: Arc<GlobalTimelines>,
}
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
@@ -350,10 +352,11 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
- let tli =
-     GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
-         .await
-         .context("create timeline")?;
+ let tli = self
+     .global_timelines
+     .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
+     .await
+     .context("create timeline")?;
tli.wal_residence_guard().await?
}
_ => {

View File

@@ -10,7 +10,6 @@ use crate::timeline::WalResidentTimeline;
use crate::wal_reader_stream::WalReaderStreamBuilder;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
- use crate::GlobalTimelines;
use anyhow::{bail, Context as AnyhowContext};
use bytes::Bytes;
use futures::future::Either;
@@ -400,7 +399,10 @@ impl SafekeeperPostgresHandler {
start_pos: Lsn,
term: Option<Term>,
) -> Result<(), QueryError> {
- let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
+ let tli = self
+     .global_timelines
+     .get(self.ttid)
+     .map_err(|e| QueryError::Other(e.into()))?;
let residence_guard = tli.wal_residence_guard().await?;
if let Err(end) = self

View File

@@ -44,8 +44,8 @@ use crate::wal_backup_partial::PartialRemoteSegment;
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
- use crate::SafeKeeperConf;
use crate::{debug_dump, timeline_manager, wal_storage};
+ use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -467,6 +467,7 @@ pub struct Timeline {
walreceivers: Arc<WalReceivers>,
timeline_dir: Utf8PathBuf,
manager_ctl: ManagerCtl,
+ conf: Arc<SafeKeeperConf>,
/// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding
/// this gate, you must respect [`Timeline::cancel`]
@@ -489,6 +490,7 @@ impl Timeline {
timeline_dir: &Utf8Path,
remote_path: &RemotePath,
shared_state: SharedState,
+ conf: Arc<SafeKeeperConf>,
) -> Arc<Self> {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state().commit_lsn);
@@ -516,6 +518,7 @@ impl Timeline {
gate: Default::default(),
cancel: CancellationToken::default(),
manager_ctl: ManagerCtl::new(),
+ conf,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
@@ -524,11 +527,14 @@ impl Timeline {
}
/// Load existing timeline from disk.
- pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+ pub fn load_timeline(
+     conf: Arc<SafeKeeperConf>,
+     ttid: TenantTimelineId,
+ ) -> Result<Arc<Timeline>> {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
- let shared_state = SharedState::restore(conf, &ttid)?;
- let timeline_dir = get_timeline_dir(conf, &ttid);
+ let shared_state = SharedState::restore(conf.as_ref(), &ttid)?;
+ let timeline_dir = get_timeline_dir(conf.as_ref(), &ttid);
let remote_path = remote_timeline_path(&ttid)?;
Ok(Timeline::new(
@@ -536,6 +542,7 @@ impl Timeline {
&timeline_dir,
&remote_path,
shared_state,
+ conf,
))
}
@@ -604,8 +611,7 @@ impl Timeline {
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.close_wal_store();
- let conf = GlobalTimelines::get_global_config();
- if !only_local && conf.is_wal_backup_enabled() {
+ if !only_local && self.conf.is_wal_backup_enabled() {
// Note: we concurrently delete remote storage data from multiple
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
// do some retries anyway.
@@ -951,7 +957,7 @@ impl WalResidentTimeline {
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
let (_, persisted_state) = self.get_state().await;
- let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
+ let enable_remote_read = self.conf.is_wal_backup_enabled();
WalReader::new(
&self.ttid,
@@ -1061,7 +1067,6 @@ impl ManagerTimeline {
/// Try to switch state Offloaded->Present.
pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> {
- let conf = GlobalTimelines::get_global_config();
let mut shared = self.write_shared_state().await;
// trying to restore WAL storage
@@ -1069,7 +1074,7 @@ impl ManagerTimeline {
&self.ttid,
&self.timeline_dir,
shared.sk.state(),
- conf.no_sync,
+ self.conf.no_sync,
)?;
// updating control file
@@ -1096,7 +1101,7 @@ impl ManagerTimeline {
// now we can switch shared.sk to Present, shouldn't fail
let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
let cfile_state = prev_sk.take_state();
- shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?);
+ shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, self.conf.my_id)?);
Ok(())
}

View File

@@ -13,7 +13,6 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
- use once_cell::sync::Lazy;
use serde::Serialize;
use std::collections::HashMap;
use std::str::FromStr;
@@ -42,23 +41,16 @@ struct GlobalTimelinesState {
// this map is dropped on restart.
tombstones: HashMap<TenantTimelineId, Instant>,
- conf: Option<SafeKeeperConf>,
+ conf: Arc<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
global_rate_limiter: RateLimiter,
}
impl GlobalTimelinesState {
- /// Get configuration, which must be set once during init.
- fn get_conf(&self) -> &SafeKeeperConf {
-     self.conf
-         .as_ref()
-         .expect("GlobalTimelinesState conf is not initialized")
- }
/// Get dependencies for a timeline constructor.
- fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
+ fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
    (
-         self.get_conf().clone(),
+         self.conf.clone(),
self.broker_active_set.clone(),
self.global_rate_limiter.clone(),
)
@@ -82,35 +74,39 @@ impl GlobalTimelinesState {
}
}
- static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
-     Mutex::new(GlobalTimelinesState {
-         timelines: HashMap::new(),
-         tombstones: HashMap::new(),
-         conf: None,
-         broker_active_set: Arc::new(TimelinesSet::default()),
-         global_rate_limiter: RateLimiter::new(1, 1),
-     })
- });
- /// A zero-sized struct used to manage access to the global timelines map.
- pub struct GlobalTimelines;
+ /// A struct used to manage access to the global timelines map.
+ pub struct GlobalTimelines {
+     state: Mutex<GlobalTimelinesState>,
+ }
impl GlobalTimelines {
+     /// Create a new instance of the global timelines map.
+     pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
+         Self {
+             state: Mutex::new(GlobalTimelinesState {
+                 timelines: HashMap::new(),
+                 tombstones: HashMap::new(),
+                 conf,
+                 broker_active_set: Arc::new(TimelinesSet::default()),
+                 global_rate_limiter: RateLimiter::new(1, 1),
+             }),
+         }
+     }
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
- pub async fn init(conf: SafeKeeperConf) -> Result<()> {
+ pub async fn init(&self) -> Result<()> {
// clippy isn't smart enough to understand that drop(state) releases the
// lock, so use explicit block
let tenants_dir = {
- let mut state = TIMELINES_STATE.lock().unwrap();
+ let mut state = self.state.lock().unwrap();
state.global_rate_limiter = RateLimiter::new(
- conf.partial_backup_concurrency,
+ state.conf.partial_backup_concurrency,
DEFAULT_EVICTION_CONCURRENCY,
);
- state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
- state.get_conf().workdir.clone()
+ state.conf.workdir.clone()
};
let mut tenant_count = 0;
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
@@ -122,7 +118,7 @@ impl GlobalTimelines {
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
{
tenant_count += 1;
- GlobalTimelines::load_tenant_timelines(tenant_id).await?;
+ self.load_tenant_timelines(tenant_id).await?;
}
}
Err(e) => error!(
@@ -135,7 +131,7 @@ impl GlobalTimelines {
info!(
"found {} tenants directories, successfully loaded {} timelines",
tenant_count,
- TIMELINES_STATE.lock().unwrap().timelines.len()
+ self.state.lock().unwrap().timelines.len()
);
Ok(())
}
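Taken together, the new constructor and `init` above, plus the reworked `task_main` signatures elsewhere in this diff, imply startup wiring roughly like the following (a sketch; the actual binary entrypoint changes are not shown here, and the module paths and `Scope` value are assumptions):

```rust
// Build the shared state once, load timelines from disk, then hand an
// Arc clone to each long-running task instead of reaching for a static.
let conf = Arc::new(conf);
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
global_timelines.init().await?;

tokio::spawn(http::task_main(
    conf.clone(),
    http_listener,
    global_timelines.clone(),
));
tokio::spawn(wal_service::task_main(
    conf.clone(),
    pg_listener,
    Scope::SafekeeperData,
    global_timelines.clone(),
));
```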
@@ -143,13 +139,13 @@ impl GlobalTimelines {
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
- /// It is async, but TIMELINES_STATE lock is sync and there is no important
+ /// It is async, but self.state lock is sync and there is no important
/// reason to make it async (it is always held for a short while), so we
/// just lock and unlock it for each timeline -- this function is called
/// during init when nothing else is running, so this is fine.
- async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
+ async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
let (conf, broker_active_set, partial_backup_rate_limiter) = {
- let state = TIMELINES_STATE.lock().unwrap();
+ let state = self.state.lock().unwrap();
state.get_dependencies()
};
@@ -163,10 +159,10 @@ impl GlobalTimelines {
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
- match Timeline::load_timeline(&conf, ttid) {
+ match Timeline::load_timeline(conf.clone(), ttid) {
Ok(tli) => {
let mut shared_state = tli.write_shared_state().await;
- TIMELINES_STATE
+ self.state
.lock()
.unwrap()
.timelines
@@ -200,29 +196,30 @@ impl GlobalTimelines {
}
/// Get the number of timelines in the map.
- pub fn timelines_count() -> usize {
-     TIMELINES_STATE.lock().unwrap().timelines.len()
+ pub fn timelines_count(&self) -> usize {
+     self.state.lock().unwrap().timelines.len()
}
/// Get the global safekeeper config.
- pub fn get_global_config() -> SafeKeeperConf {
-     TIMELINES_STATE.lock().unwrap().get_conf().clone()
+ pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
+     self.state.lock().unwrap().conf.clone()
}
- pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
-     TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
+ pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
+     self.state.lock().unwrap().broker_active_set.clone()
}
/// Create a new timeline with the given id. If the timeline already exists, returns
/// an existing timeline.
pub(crate) async fn create(
+ &self,
ttid: TenantTimelineId,
server_info: ServerInfo,
commit_lsn: Lsn,
local_start_lsn: Lsn,
) -> Result<Arc<Timeline>> {
let (conf, _, _) = {
- let state = TIMELINES_STATE.lock().unwrap();
+ let state = self.state.lock().unwrap();
if let Ok(timeline) = state.get(&ttid) {
// Timeline already exists, return it.
return Ok(timeline);
@@ -245,7 +242,7 @@ impl GlobalTimelines {
let state =
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
- let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
+ let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
Ok(timeline)
}
@@ -261,13 +258,14 @@ impl GlobalTimelines {
/// 2) move the directory and load the timeline
/// 3) take lock again and insert the timeline into the global map.
pub async fn load_temp_timeline(
+ &self,
ttid: TenantTimelineId,
tmp_path: &Utf8PathBuf,
check_tombstone: bool,
) -> Result<Arc<Timeline>> {
// Check for existence and mark that we're creating it.
let (conf, broker_active_set, partial_backup_rate_limiter) = {
- let mut state = TIMELINES_STATE.lock().unwrap();
+ let mut state = self.state.lock().unwrap();
match state.timelines.get(&ttid) {
Some(GlobalMapTimeline::CreationInProgress) => {
bail!(TimelineError::CreationInProgress(ttid));
@@ -295,10 +293,10 @@ impl GlobalTimelines {
};
// Do the actual move and reflect the result in the map.
- match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
+ match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
Ok(timeline) => {
let mut timeline_shared_state = timeline.write_shared_state().await;
- let mut state = TIMELINES_STATE.lock().unwrap();
+ let mut state = self.state.lock().unwrap();
assert!(matches!(
state.timelines.get(&ttid),
Some(GlobalMapTimeline::CreationInProgress)
@@ -319,7 +317,7 @@ impl GlobalTimelines {
}
Err(e) => {
// Init failed, remove the marker from the map
- let mut state = TIMELINES_STATE.lock().unwrap();
+ let mut state = self.state.lock().unwrap();
assert!(matches!(
state.timelines.get(&ttid),
Some(GlobalMapTimeline::CreationInProgress)
@@ -334,10 +332,10 @@ impl GlobalTimelines {
async fn install_temp_timeline(
ttid: TenantTimelineId,
tmp_path: &Utf8PathBuf,
- conf: &SafeKeeperConf,
+ conf: Arc<SafeKeeperConf>,
) -> Result<Arc<Timeline>> {
- let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
- let timeline_path = get_timeline_dir(conf, &ttid);
+ let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
+ let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
// We must have already checked that timeline doesn't exist in the map,
// but there might be existing datadir: if timeline is corrupted it is
@@ -382,9 +380,9 @@ impl GlobalTimelines {
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
/// i.e. loaded in memory and not cancelled.
- pub(crate) fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
+ pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
let tli_res = {
- let state = TIMELINES_STATE.lock().unwrap();
+ let state = self.state.lock().unwrap();
state.get(&ttid)
};
match tli_res {
@@ -399,8 +397,8 @@ impl GlobalTimelines {
}
/// Returns all timelines. This is used for background timeline processes.
- pub fn get_all() -> Vec<Arc<Timeline>> {
-     let global_lock = TIMELINES_STATE.lock().unwrap();
+ pub fn get_all(&self) -> Vec<Arc<Timeline>> {
+     let global_lock = self.state.lock().unwrap();
global_lock
.timelines
.values()
@@ -419,8 +417,8 @@ impl GlobalTimelines {
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
/// and that's why it can return cancelled timelines, to retry deleting them.
- fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
-     let global_lock = TIMELINES_STATE.lock().unwrap();
+ fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
+     let global_lock = self.state.lock().unwrap();
global_lock
.timelines
.values()
@@ -435,11 +433,12 @@ impl GlobalTimelines {
/// Cancels timeline, then deletes the corresponding data directory.
/// If only_local, doesn't remove WAL segments in remote storage.
pub(crate) async fn delete(
+ &self,
ttid: &TenantTimelineId,
only_local: bool,
) -> Result<TimelineDeleteForceResult> {
let tli_res = {
- let state = TIMELINES_STATE.lock().unwrap();
+ let state = self.state.lock().unwrap();
if state.tombstones.contains_key(ttid) {
// Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
@@ -472,7 +471,7 @@ impl GlobalTimelines {
}
Err(_) => {
// Timeline is not in memory, but it may still exist on disk in a broken state.
- let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
+ let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
let dir_existed = delete_dir(dir_path)?;
Ok(TimelineDeleteForceResult {
@@ -485,7 +484,7 @@ impl GlobalTimelines {
// Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
// are used to prevent still-running computes from re-creating the same timeline when they send data,
// and to speed up repeated deletion calls by avoiding re-listing objects.
- TIMELINES_STATE.lock().unwrap().delete(*ttid);
+ self.state.lock().unwrap().delete(*ttid);
result
}
@@ -497,17 +496,18 @@ impl GlobalTimelines {
///
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete_force_all_for_tenant(
+ &self,
tenant_id: &TenantId,
only_local: bool,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
- let to_delete = Self::get_all_for_tenant(*tenant_id);
+ let to_delete = self.get_all_for_tenant(*tenant_id);
let mut err = None;
let mut deleted = HashMap::new();
for tli in &to_delete {
- match Self::delete(&tli.ttid, only_local).await {
+ match self.delete(&tli.ttid, only_local).await {
Ok(result) => {
deleted.insert(tli.ttid, result);
}
@@ -529,15 +529,15 @@ impl GlobalTimelines {
// so the directory may be not empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(get_tenant_dir(
- TIMELINES_STATE.lock().unwrap().get_conf(),
+ self.state.lock().unwrap().conf.as_ref(),
tenant_id,
))?;
Ok(deleted)
}
- pub fn housekeeping(tombstone_ttl: &Duration) {
-     let mut state = TIMELINES_STATE.lock().unwrap();
+ pub fn housekeeping(&self, tombstone_ttl: &Duration) {
+     let mut state = self.state.lock().unwrap();
// We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
// timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they

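The `housekeeping` body is truncated above; given `tombstones: HashMap<TenantTimelineId, Instant>`, the TTL sweep the comment describes plausibly reduces to a single `retain` (an assumption about the elided code, not a quote of it):

```rust
// Drop tombstones older than the TTL so the map cannot grow without bound,
// while recent deletions keep blocking rogue re-creation.
state
    .tombstones
    .retain(|_, deleted_at| deleted_at.elapsed() < *tombstone_ttl);
```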
View File

@@ -4,6 +4,7 @@
//!
use anyhow::{Context, Result};
use postgres_backend::QueryError;
+ use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;
@@ -11,9 +12,9 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{auth::Scope, measured_stream::MeasuredStream};
- use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::TrafficMetrics;
use crate::SafeKeeperConf;
+ use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
use postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
@@ -22,9 +23,10 @@ use postgres_backend::{AuthType, PostgresBackend};
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
pub async fn task_main(
- conf: SafeKeeperConf,
+ conf: Arc<SafeKeeperConf>,
pg_listener: std::net::TcpListener,
allowed_auth_scope: Scope,
+ global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
@@ -37,10 +39,10 @@ pub async fn task_main(
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
+ let global_timelines = global_timelines.clone();
tokio::spawn(
async move {
- if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
+ if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await {
error!("connection handler exited: {}", err);
}
}
@@ -53,9 +55,10 @@ pub async fn task_main(
///
async fn handle_socket(
socket: TcpStream,
- conf: SafeKeeperConf,
+ conf: Arc<SafeKeeperConf>,
conn_id: ConnectionId,
allowed_auth_scope: Scope,
+ global_timelines: Arc<GlobalTimelines>,
) -> Result<(), QueryError> {
socket.set_nodelay(true)?;
let peer_addr = socket.peer_addr()?;
@@ -96,8 +99,13 @@ async fn handle_socket(
Some(_) => AuthType::NeonJWT,
};
let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
- let mut conn_handler =
-     SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
+ let mut conn_handler = SafekeeperPostgresHandler::new(
+     conf,
+     conn_id,
+     Some(traffic_metrics.clone()),
+     auth_pair,
+     global_timelines,
+ );
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.

View File

@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
- pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
+ pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
#[derive(Copy, Clone)]
pub(crate) struct Drain {

View File

@@ -18,8 +18,9 @@ use pageserver_api::controller_api::{
ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
- TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
- TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
+ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
+ TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
+ TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::{mgmt_api, BlockUnblock};
@@ -208,6 +209,27 @@ async fn handle_tenant_location_config(
)
}
+ async fn handle_tenant_config_patch(
+     service: Arc<Service>,
+     req: Request<Body>,
+ ) -> Result<Response<Body>, ApiError> {
+     check_permissions(&req, Scope::PageServerApi)?;
+     let mut req = match maybe_forward(req).await {
+         ForwardOutcome::Forwarded(res) => {
+             return res;
+         }
+         ForwardOutcome::NotForwarded(req) => req,
+     };
+     let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
+     json_response(
+         StatusCode::OK,
+         service.tenant_config_patch(config_req).await?,
+     )
+ }
async fn handle_tenant_config_set(
service: Arc<Service>,
req: Request<Body>,
@@ -857,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::ACCEPTED, ())
}
+ async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+     check_permissions(&req, Scope::Infra)?;
+     let req = match maybe_forward(req).await {
+         ForwardOutcome::Forwarded(res) => {
+             return res;
+         }
+         ForwardOutcome::NotForwarded(req) => req,
+     };
+     let state = get_state(&req);
+     let safekeepers = state.service.safekeepers_list().await?;
+     json_response(StatusCode::OK, safekeepers)
+ }
async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
@@ -1181,7 +1218,7 @@ impl From<ReconcileError> for ApiError {
///
/// Not used by anything except manual testing.
async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
- check_permissions(&req, Scope::Admin)?;
+ check_permissions(&req, Scope::Infra)?;
let id = parse_request_param::<i64>(&req, "id")?;
@@ -1199,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
match res {
Ok(b) => json_response(StatusCode::OK, b),
Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
Err(ApiError::NotFound("unknown instance_id".into()))
Err(ApiError::NotFound("unknown instance id".into()))
}
Err(other) => Err(other.into()),
}
@@ -1795,6 +1832,21 @@ pub fn make_router(
RequestName("control_v1_metadata_health_list_outdated"),
)
})
+ // Safekeepers
+ .get("/control/v1/safekeeper", |r| {
+     named_request_span(
+         r,
+         handle_safekeeper_list,
+         RequestName("control_v1_safekeeper_list"),
+     )
+ })
+ .get("/control/v1/safekeeper/:id", |r| {
+     named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+ })
+ .post("/control/v1/safekeeper/:id", |r| {
+     // id is in the body
+     named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+ })
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(
@@ -1847,13 +1899,6 @@ pub fn make_router(
.put("/control/v1/step_down", |r| {
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
})
.get("/control/v1/safekeeper/:id", |r| {
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
})
.post("/control/v1/safekeeper/:id", |r| {
// id is in the body
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -1863,6 +1908,13 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
})
.patch("/v1/tenant/config", |r| {
tenant_service_handler(
r,
handle_tenant_config_patch,
RequestName("v1_tenant_config"),
)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
})
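For orientation, a hypothetical client call against the new route (the exact `TenantConfigPatchRequest` fields live in pageserver_api and are not shown in this diff; `gc_horizon` is just an example of a patchable knob, and the omitted-field semantics are presumed rather than confirmed here):

```rust
// Sketch using reqwest: PATCH presumably updates only the fields present
// in the body, unlike PUT which replaces the whole config.
let resp = reqwest::Client::new()
    .patch(format!("{storcon_url}/v1/tenant/config"))
    .json(&serde_json::json!({
        "tenant_id": tenant_id.to_string(),
        "gc_horizon": 67_108_864,
    }))
    .send()
    .await?;
assert!(resp.status().is_success());
```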

Some files were not shown because too many files have changed in this diff.