Add test for prewarm under workload

Make ruffhappy
Eliminate stale reads from LFC in case of prewarm conflict
2026-02-09 21:50:37 +00:00 · 2024-12-23 11:36:47 +02:00 · 2024-12-15 08:02:35 +02:00 · 2024-12-14 21:58:26 +02:00 · 2024-12-14 21:11:20 +02:00 · 2024-12-14 21:09:58 +02:00
124 changed files with 3924 additions and 1753 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -23,3 +23,5 @@ config-variables:
  - BENCHMARK_INGEST_TARGET_PROJECTID
  - PGREGRESS_PG16_PROJECT_ID
  - PGREGRESS_PG17_PROJECT_ID
+  - SLACK_ON_CALL_QA_STAGING_STREAM
+  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -7,10 +7,9 @@ inputs:
    type: boolean
    required: false
    default: false
-  aws_oicd_role_arn:
-    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
-    required: false
-    default: ''
+  aws-oicd-role-arn:
+    description: 'OIDC role arn to interract with S3'
+    required: true

 outputs:
  base-url:
@@ -84,12 +83,11 @@ runs:
        ALLURE_VERSION: 2.27.0
        ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777

-    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
-      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
-      uses: aws-actions/configure-aws-credentials@v4
+    - uses: aws-actions/configure-aws-credentials@v4
+      if: ${{ !cancelled() }}
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-to-assume: ${{ inputs.aws-oicd-role-arn }}
        role-duration-seconds: 3600 # 1 hour should be more than enough to upload report

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -8,10 +8,9 @@ inputs:
  unique-key:
    description: 'string to distinguish different results in the same run'
    required: true
-  aws_oicd_role_arn:
-    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
-    required: false
-    default: ''
+  aws-oicd-role-arn:
+    description: 'OIDC role arn to interract with S3'
+    required: true

 runs:
  using: "composite"
@@ -36,12 +35,11 @@ runs:
      env:
        REPORT_DIR: ${{ inputs.report-dir }}

-    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
-      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
-      uses: aws-actions/configure-aws-credentials@v4
+    - uses: aws-actions/configure-aws-credentials@v4
+      if: ${{ !cancelled() }}
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-to-assume: ${{ inputs.aws-oicd-role-arn }}
        role-duration-seconds: 3600 # 1 hour should be more than enough to upload report

    - name: Upload test results
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -15,10 +15,19 @@ inputs:
  prefix:
    description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
    required: false
+  aws-oicd-role-arn:
+    description: 'OIDC role arn to interract with S3'
+    required: true

 runs:
  using: "composite"
  steps:
+    - uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws-oicd-role-arn }}
+        role-duration-seconds: 3600
+
    - name: Download artifact
      id: download-artifact
      shell: bash -euxo pipefail {0}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -48,10 +48,9 @@ inputs:
    description: 'benchmark durations JSON'
    required: false
    default: '{}'
-  aws_oicd_role_arn:
-    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
-    required: false
-    default: ''
+  aws-oicd-role-arn:
+    description: 'OIDC role arn to interract with S3'
+    required: true

 runs:
  using: "composite"
@@ -62,6 +61,7 @@ runs:
      with:
        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

    - name: Download Neon binaries for the previous release
      if: inputs.build_type != 'remote'
@@ -70,6 +70,7 @@ runs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon-previous
        prefix: latest
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

    - name: Download compatibility snapshot
      if: inputs.build_type != 'remote'
@@ -81,6 +82,7 @@ runs:
        # The lack of compatibility snapshot (for example, for the new Postgres version)
        # shouldn't fail the whole job. Only relevant test should fail.
        skip-if-does-not-exist: true
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
@@ -218,17 +220,19 @@ runs:
        # The lack of compatibility snapshot shouldn't fail the job
        # (for example if we didn't run the test for non build-and-test workflow)
        skip-if-does-not-exist: true
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

-    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
-      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
-      uses: aws-actions/configure-aws-credentials@v4
+    - uses: aws-actions/configure-aws-credentials@v4
+      if: ${{ !cancelled() }}
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-to-assume: ${{ inputs.aws-oicd-role-arn }}
        role-duration-seconds: 3600 # 1 hour should be more than enough to upload report
+
    - name: Upload test results
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
--- a/.github/actions/save-coverage-data/action.yml
+++ b/.github/actions/save-coverage-data/action.yml
@@ -14,9 +14,11 @@ runs:
        name: coverage-data-artifact
        path: /tmp/coverage
        skip-if-does-not-exist: true # skip if there's no previous coverage to download
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

    - name: Upload coverage data
      uses: ./.github/actions/upload
      with:
        name: coverage-data-artifact
        path: /tmp/coverage
+        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -14,6 +14,10 @@ inputs:
  prefix:
    description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
    required: false
+  aws-oicd-role-arn:
+    description: "the OIDC role arn for aws auth"
+    required: false
+    default: ""

 runs:
  using: "composite"
@@ -53,6 +57,13 @@ runs:

        echo 'SKIPPED=false' >> $GITHUB_OUTPUT

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws-oicd-role-arn }}
+        role-duration-seconds: 3600
+
    - name: Upload artifact
      if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }}
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -70,6 +70,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    # we create a table that has one row for each database that we want to restore with the status whether the restore is done
    - name: Create benchmark_restore_status table if it does not exist
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -31,12 +31,13 @@ defaults:
 env:
  RUST_BACKTRACE: 1
  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
  build-neon:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      contents: read
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
@@ -205,6 +206,13 @@ jobs:
            done
          fi

+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
      - name: Run rust tests
        env:
          NEXTEST_RETRIES: 3
@@ -256,6 +264,7 @@ jobs:
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
          path: /tmp/neon
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
@@ -265,6 +274,10 @@ jobs:
  regress-tests:
    # Don't run regression tests on debug arm64 builds
    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      contents: read
+      statuses: write
    needs: [ build-neon ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
    container:
@@ -283,7 +296,7 @@ jobs:
          submodules: true

      - name: Pytest regression tests
-        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
+        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
@@ -295,6 +308,7 @@ jobs:
          real_s3_region: eu-central-1
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -105,6 +105,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      id: create-neon-project
@@ -122,7 +123,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
@@ -152,7 +153,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -204,6 +205,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Run Logical Replication benchmarks
      uses: ./.github/actions/run-python-test-set
@@ -214,7 +216,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -231,7 +233,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -243,7 +245,7 @@ jobs:
      uses: ./.github/actions/allure-report-generate
      with:
        store-test-results-into-db: true
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

@@ -405,6 +407,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
@@ -452,7 +455,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -467,7 +470,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -482,7 +485,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -500,7 +503,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -611,7 +614,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -626,7 +629,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -637,7 +640,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -708,6 +711,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -739,7 +743,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -753,7 +757,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -818,6 +822,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Get Connstring Secret Name
      run: |
@@ -856,7 +861,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_tpch
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -868,7 +873,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -926,6 +931,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -957,7 +963,7 @@ jobs:
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -968,7 +974,7 @@ jobs:
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,8 +21,6 @@ concurrency:
 env:
  RUST_BACKTRACE: 1
  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

@@ -255,15 +253,15 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      # run without LFC on v17 release only
+      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
+      # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled.
      test-cfg: |
-        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v15", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v16", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "with-lfc"}]'
-                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
+        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v15", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v16", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "without-lfc"}]'
+                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc" }]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -305,6 +303,11 @@ jobs:
  benchmarks:
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -333,6 +336,7 @@ jobs:
          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
          pg_version: v16
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -345,6 +349,11 @@ jobs:
  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    runs-on: ubuntu-22.04

    steps:
@@ -360,6 +369,11 @@ jobs:
  create-test-report:
    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}

@@ -380,6 +394,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate
        with:
          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

@@ -411,6 +426,10 @@ jobs:
  coverage-report:
    if: ${{ !startsWith(github.ref_name, 'release') }}
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -437,12 +456,14 @@ jobs:
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
          path: /tmp/neon
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Get coverage artifact
        uses: ./.github/actions/download
        with:
          name: coverage-data-artifact
          path: /tmp/coverage
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Merge coverage data
        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
@@ -573,6 +594,10 @@ jobs:
  neon-image:
    needs: [ neon-image-arch, tag ]
    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read

    steps:
      - uses: docker/login-action@v3
@@ -587,11 +612,15 @@ jobs:
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Push multi-arch image to ECR
        run: |
@@ -600,6 +629,10 @@ jobs:

  compute-node-image-arch:
    needs: [ check-permissions, build-build-tools-image, tag ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    strategy:
      fail-fast: false
      matrix:
@@ -640,11 +673,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - uses: docker/login-action@v3
        with:
@@ -717,6 +754,10 @@ jobs:

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    runs-on: ubuntu-22.04

    strategy:
@@ -761,11 +802,15 @@ jobs:
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
        run: |
@@ -795,7 +840,7 @@ jobs:
          - pg: v17
            debian: bookworm
    env:
-      VM_BUILDER_VERSION: v0.35.0
+      VM_BUILDER_VERSION: v0.37.1

    steps:
      - uses: actions/checkout@v4
@@ -890,7 +935,9 @@ jobs:
    runs-on: ubuntu-22.04

    permissions:
-      id-token: write # for `aws-actions/configure-aws-credentials`
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read

    env:
      VERSIONS: v14 v15 v16 v17
@@ -901,12 +948,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Login to dev ECR
-        uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Copy vm-compute-node images to ECR
        run: |
@@ -985,6 +1035,11 @@ jobs:
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -1060,12 +1115,79 @@ jobs:
    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
-
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    steps:
      - uses: actions/checkout@v4

+      - name: Create git tag and GitHub release
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+        uses: actions/github-script@v7
+        with:
+          retries: 5
+          script: |
+            const tag = "${{ needs.tag.outputs.build-tag }}";
+
+            try {
+              const existingRef = await github.rest.git.getRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `tags/${tag}`,
+              });
+
+              if (existingRef.data.object.sha !== context.sha) {
+                throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
+              }
+
+              console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Tag ${tag} does not exist. Creating it...`);
+              await github.rest.git.createRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `refs/tags/${tag}`,
+                sha: context.sha,
+              });
+              console.log(`Tag ${tag} created successfully.`);
+            }
+
+            // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
+            if (context.ref !== 'refs/heads/release') {
+              console.log(`GitHub release skipped for ${context.ref}.`);
+              return;
+            }
+
+            try {
+              const existingRelease = await github.rest.repos.getReleaseByTag({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag: tag,
+              });
+
+              console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Release for tag ${tag} does not exist. Creating it...`);
+              await github.rest.repos.createRelease({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag_name: tag,
+                generate_release_notes: true,
+              });
+              console.log(`Release for tag ${tag} created successfully.`);
+            }
+
      - name: Trigger deploy workflow
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1115,38 +1237,13 @@ jobs:
            exit 1
          fi

-      - name: Create git tag
-        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.git.createRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
-              sha: context.sha,
-            })
-
-      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
-      - name: Create GitHub release
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.repos.createRelease({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              tag_name: "${{ needs.tag.outputs.build-tag }}",
-              generate_release_notes: true,
-            })
-
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: read
    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: github.ref_name == 'release' && !failure() && !cancelled()

@@ -1183,6 +1280,12 @@ jobs:
          echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
          echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}

+      - uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
      - name: Promote compatibility snapshot and Neon artifact
        env:
          BUCKET: neon-github-public-dev
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -19,14 +19,15 @@ concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+
 jobs:
  regress:
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
    strategy:
      fail-fast: false
      matrix:
@@ -78,6 +79,7 @@ jobs:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create a new branch
        id: create-branch
@@ -93,10 +95,12 @@ jobs:
          test_selection: cloud_regress
          pg_version: ${{matrix.pg-version}}
          extra_params: -m remote_cluster
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}

      - name: Delete branch
+        if: always()
        uses: ./.github/actions/neon-branch-delete
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
@@ -107,12 +111,14 @@ jobs:
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
+        with:
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
-          channel-id: "C033QLM5P7D" # on-call-staging-stream
+          channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
          slack-message: |
            Periodic pg_regress on staging: ${{ job.status }}
            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -13,7 +13,7 @@ on:
    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron:   '0 9 * * *' # run once a day, timezone is utc
  workflow_dispatch: # adds ability to run this manually
-    
+
 defaults:
  run:
    shell: bash -euxo pipefail {0}
@@ -28,7 +28,7 @@ jobs:
    strategy:
      fail-fast: false # allow other variants to continue even if one fails
      matrix:
-        target_project: [new_empty_project, large_existing_project]  
+        target_project: [new_empty_project, large_existing_project]
    permissions:
      contents: write
      statuses: write
@@ -56,7 +56,7 @@ jobs:
      with:
        aws-region: eu-central-1
        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role 
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -64,6 +64,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      if: ${{ matrix.target_project == 'new_empty_project' }}
@@ -94,7 +95,7 @@ jobs:
        project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}

-    - name: Initialize Neon project 
+    - name: Initialize Neon project
      if: ${{ matrix.target_project == 'large_existing_project' }}
      env:
          BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
@@ -122,7 +123,7 @@ jobs:
        ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
        echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV

-    - name: Invoke pgcopydb  
+    - name: Invoke pgcopydb
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: remote
@@ -131,7 +132,7 @@ jobs:
        extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb
        pg_version: v16
        save_perf_report: true
-        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
        TARGET_PROJECT_TYPE: ${{ matrix.target_project }}
@@ -143,7 +144,7 @@ jobs:
      run: |
        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
        ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+"
-      
+
    - name: Delete Neon Project
      if: ${{ always() && matrix.target_project == 'new_empty_project' }}
      uses: ./.github/actions/neon-project-delete
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -143,6 +143,10 @@ jobs:

  gather-rust-build-stats:
    needs: [ check-permissions, build-build-tools-image ]
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -177,13 +181,18 @@ jobs:
      - name: Produce the build stats
        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)

+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
      - name: Upload the build stats
        id: upload-stats
        env:
          BUCKET: neon-github-public-dev
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html
          aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/"
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -27,6 +27,11 @@ concurrency:

 jobs:
  trigger_bench_on_ec2_machine_in_eu_central_1:
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
    runs-on: [ self-hosted, small ]
    container:
      image: neondatabase/build-tools:pinned-bookworm
@@ -38,8 +43,6 @@ jobs:
    env:
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION : "eu-central-1"
      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
    steps:
@@ -50,6 +53,13 @@ jobs:
    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
      run: curl https://ifconfig.me

+    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
+        role-duration-seconds: 3600
+
    - name: Start EC2 instance and wait for the instance to boot up
      run: |
        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
@@ -124,11 +134,10 @@ jobs:
        cat "test_log_${GITHUB_RUN_ID}"

    - name: Create Allure report
-      env:
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
+      with:
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -148,6 +157,14 @@ jobs:
        -H "Authorization: Bearer $API_KEY" \
        -d ''

+    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
+      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
+        role-duration-seconds: 3600
+
    - name: Stop EC2 instance and wait for the instance to be stopped
      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
      run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -25,11 +25,13 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+  statuses: write # require for posting a status update
+
 env:
  DEFAULT_PG_VERSION: 16
  PLATFORM: neon-captest-new
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  AWS_DEFAULT_REGION: eu-central-1

 jobs:
@@ -94,6 +96,7 @@ jobs:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Neon Project
        id: create-neon-project
@@ -110,6 +113,7 @@ jobs:
          run_in_parallel: false
          extra_params: -m remote_cluster
          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}

@@ -126,6 +130,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate
        with:
          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

@@ -159,6 +164,7 @@ jobs:
        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
        path: /tmp/neon/
        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Neon Project
      id: create-neon-project
@@ -175,6 +181,7 @@ jobs:
        run_in_parallel: false
        extra_params: -m remote_cluster
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}

@@ -191,6 +198,7 @@ jobs:
      uses: ./.github/actions/allure-report-generate
      with:
        store-test-results-into-db: true
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -67,7 +67,7 @@ jobs:
    runs-on: ubuntu-22.04

    permissions:
-      id-token: write # for `azure/login`
+      id-token: write # for `azure/login` and aws auth

    steps:
      - uses: docker/login-action@v3
@@ -75,11 +75,15 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 3600
+
+      - name: Login to Amazon Dev ECR
+        uses: aws-actions/amazon-ecr-login@v2

      - name: Azure login
        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -63,6 +63,7 @@ jobs:
    if: always()
    permissions:
      statuses: write # for `github.repos.createCommitStatus(...)`
+      contents: write
    needs:
      - get-changed-files
      - check-codestyle-python
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,7 +3,7 @@ name: Create Release Branch
 on:
  schedule:
    # It should be kept in sync with if-condition in jobs
-    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * FRI' # Storage release
    - cron: '0 6 * * THU' # Proxy release
  workflow_dispatch:
    inputs:
@@ -29,7 +29,7 @@ defaults:

 jobs:
  create-storage-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}
+    if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}

    permissions:
      contents: write
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5565,7 +5565,10 @@ name = "safekeeper_api"
 version = "0.1.0"
 dependencies = [
 "const_format",
+ "postgres_ffi",
+ "pq_proto",
 "serde",
+ "tokio",
 "utils",
 ]

--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -19,3 +19,10 @@ max_prepared_statements=0
 admin_users=postgres
 unix_socket_dir=/tmp/
 unix_socket_mode=0777
+
+;; Disable connection logging. It produces a lot of logs that no one looks at,
+;; and we can get similar log entries from the proxy too. We had incidents in
+;; the past where the logging significantly stressed the log device or pgbouncer
+;; itself.
+log_connections=0
+log_disconnections=0
--- a/compute/patches/cloud_regress_pg16.patch
+++ b/compute/patches/cloud_regress_pg16.patch
@@ -981,7 +981,7 @@ index fc42d418bf..e38f517574 100644
 CREATE SCHEMA addr_nsp;
 SET search_path TO 'addr_nsp';
 diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out
-index 8475231735..1afae5395f 100644
+index 8475231735..0653946337 100644
 --- a/src/test/regress/expected/password.out
 +++ b/src/test/regress/expected/password.out
@@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok
@@ -1006,65 +1006,63 @@ index 8475231735..1afae5395f 100644
 -----------------+---------------------------------------------------
 - regress_passwd1 | md5783277baca28003b33453252be4dbb34
 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3
-+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
-+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1
+ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2
  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 - regress_passwd4 | 
 + regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 (4 rows)
 
 -- Rename a role
-@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+@@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
 -- passwords.
 SET password_encryption = 'md5';
 -- encrypt with MD5
 -ALTER ROLE regress_passwd2 PASSWORD 'foo';
+--- already encrypted, use as they are
+-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted, use as they are
- ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
 SET password_encryption = 'scram-sha-256';
 -- create SCRAM secret
 -ALTER ROLE  regress_passwd4 PASSWORD 'foo';
+--- already encrypted with MD5, use as it is
+-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+--- This looks like a valid SCRAM-SHA-256 secret, but it is not
+--- so it should be hashed with SCRAM-SHA-256.
+-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
+--- These may look like valid MD5 secrets, but they are not, so they
+--- should be hashed with SCRAM-SHA-256.
+--- trailing garbage at the end
+-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
+--- invalid length
+-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
 +ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted with MD5, use as it is
- CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- This looks like a valid SCRAM-SHA-256 secret, but it is not
- -- so it should be hashed with SCRAM-SHA-256.
- CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- These may look like valid MD5 secrets, but they are not, so they
- -- should be hashed with SCRAM-SHA-256.
- -- trailing garbage at the end
- CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- invalid length
- CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- Changing the SCRAM iteration count
 SET scram_iterations = 1024;
 CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount';
-@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+@@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
     ORDER BY rolname, rolpassword;
      rolname     |                rolpassword_masked                 
 -----------------+---------------------------------------------------
 - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70
 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb
-+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
-+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1
+ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2
  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
  regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023
- regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
- regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
- regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
-  regress_passwd9 | SCRAM-SHA-256$1024:<salt>$<storedkey>:<serverkey>
-(9 rows)
-+(5 rows)
- 
+ regress_passwd5 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+@@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
 -- An empty password is not allowed, in any form
 CREATE ROLE regress_passwd_empty PASSWORD '';
 NOTICE:  empty string is not a valid password, clearing password
@@ -1082,56 +1080,37 @@ index 8475231735..1afae5395f 100644
 -(1 row)
 +(0 rows)
 
- -- Test with invalid stored and server keys.
- --
- -- The first is valid, to act as a control. The others have too long
- -- stored/server keys. They will be re-hashed.
- CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+--- Test with invalid stored and server keys.
+---
+--- The first is valid, to act as a control. The others have too long
+--- stored/server keys. They will be re-hashed.
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- Check that the invalid secrets were re-hashed. A re-hashed secret
 -- should not contain the original salt.
 SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed
-     FROM pg_authid
-     WHERE rolname LIKE 'regress_passwd_sha_len%'
+@@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw
     ORDER BY rolname;
-         rolname         | is_rolpassword_rehashed 
--------------------------+-------------------------
+          rolname         | is_rolpassword_rehashed 
+ -------------------------+-------------------------
 - regress_passwd_sha_len0 | f
- regress_passwd_sha_len1 | t
- regress_passwd_sha_len2 | t
-(3 rows)
-+ rolname | is_rolpassword_rehashed 
-+---------+-------------------------
-+(0 rows)
- 
- DROP ROLE regress_passwd1;
- DROP ROLE regress_passwd2;
- DROP ROLE regress_passwd3;
- DROP ROLE regress_passwd4;
- DROP ROLE regress_passwd5;
-+ERROR:  role "regress_passwd5" does not exist
- DROP ROLE regress_passwd6;
-+ERROR:  role "regress_passwd6" does not exist
- DROP ROLE regress_passwd7;
-+ERROR:  role "regress_passwd7" does not exist
+ regress_passwd_sha_len0 | t
+  regress_passwd_sha_len1 | t
+  regress_passwd_sha_len2 | t
+ (3 rows)
+@@ -135,6 +124,7 @@ DROP ROLE regress_passwd7;
 DROP ROLE regress_passwd8;
-+ERROR:  role "regress_passwd8" does not exist
 DROP ROLE regress_passwd9;
 DROP ROLE regress_passwd_empty;
 +ERROR:  role "regress_passwd_empty" does not exist
 DROP ROLE regress_passwd_sha_len0;
-+ERROR:  role "regress_passwd_sha_len0" does not exist
 DROP ROLE regress_passwd_sha_len1;
-+ERROR:  role "regress_passwd_sha_len1" does not exist
 DROP ROLE regress_passwd_sha_len2;
-+ERROR:  role "regress_passwd_sha_len2" does not exist
- -- all entries should have been removed
- SELECT rolname, rolpassword
-     FROM pg_authid
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
 index 5b9dba7b32..cc408dad42 100644
 --- a/src/test/regress/expected/privileges.out
@@ -3194,7 +3173,7 @@ index 1a6c61f49d..1c31ac6a53 100644
 -- Test generic object addressing/identification functions
 CREATE SCHEMA addr_nsp;
 diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql
-index 53e86b0b6c..f07cf1ec54 100644
+index 53e86b0b6c..0303fdfe96 100644
 --- a/src/test/regress/sql/password.sql
 +++ b/src/test/regress/sql/password.sql
@@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok
@@ -3213,23 +3192,59 @@ index 53e86b0b6c..f07cf1ec54 100644
 
 -- check list of created entries
 --
-@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+@@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
 SET password_encryption = 'md5';
 
 -- encrypt with MD5
 -ALTER ROLE regress_passwd2 PASSWORD 'foo';
+--- already encrypted, use as they are
+-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted, use as they are
- ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
- ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 
 SET password_encryption = 'scram-sha-256';
 -- create SCRAM secret
 -ALTER ROLE  regress_passwd4 PASSWORD 'foo';
+--- already encrypted with MD5, use as it is
+-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
 +ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted with MD5, use as it is
- CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 
+--- This looks like a valid SCRAM-SHA-256 secret, but it is not
+--- so it should be hashed with SCRAM-SHA-256.
+-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
+--- These may look like valid MD5 secrets, but they are not, so they
+--- should be hashed with SCRAM-SHA-256.
+--- trailing garbage at the end
+-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
+--- invalid length
+-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Changing the SCRAM iteration count
+ SET scram_iterations = 1024;
+@@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a';
+ ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4=';
+ SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty';
+ 
+--- Test with invalid stored and server keys.
+---
+--- The first is valid, to act as a control. The others have too long
+--- stored/server keys. They will be re-hashed.
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Check that the invalid secrets were re-hashed. A re-hashed secret
+ -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
 index 249df17a58..b258e7f26a 100644
 --- a/src/test/regress/sql/privileges.sql
--- a/compute/patches/cloud_regress_pg17.patch
+++ b/compute/patches/cloud_regress_pg17.patch
@@ -1014,10 +1014,10 @@ index fc42d418bf..e38f517574 100644
 CREATE SCHEMA addr_nsp;
 SET search_path TO 'addr_nsp';
 diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out
-index 924d6e001d..5966531db6 100644
+index 924d6e001d..7fdda73439 100644
 --- a/src/test/regress/expected/password.out
 +++ b/src/test/regress/expected/password.out
-@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok
+@@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok
 SET password_encryption = 'scram-sha-256'; -- ok
 -- consistency of password entries
 SET password_encryption = 'md5';
@@ -1026,9 +1026,7 @@ index 924d6e001d..5966531db6 100644
 -CREATE ROLE regress_passwd2;
 -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2';
 +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
-+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
-+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 SET password_encryption = 'scram-sha-256';
 -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
 -CREATE ROLE regress_passwd4 PASSWORD NULL;
@@ -1037,71 +1035,69 @@ index 924d6e001d..5966531db6 100644
 -- check list of created entries
 --
 -- The scram secret will look something like:
-@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+@@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
     ORDER BY rolname, rolpassword;
      rolname     |                rolpassword_masked                 
 -----------------+---------------------------------------------------
 - regress_passwd1 | md5783277baca28003b33453252be4dbb34
 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3
-+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
-+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1
+ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2
  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 - regress_passwd4 | 
 + regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 (4 rows)
 
 -- Rename a role
-@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+@@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
 -- passwords.
 SET password_encryption = 'md5';
 -- encrypt with MD5
 -ALTER ROLE regress_passwd2 PASSWORD 'foo';
+--- already encrypted, use as they are
+-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted, use as they are
- ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
 SET password_encryption = 'scram-sha-256';
 -- create SCRAM secret
 -ALTER ROLE  regress_passwd4 PASSWORD 'foo';
 +ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- already encrypted with MD5, use as it is
- CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- This looks like a valid SCRAM-SHA-256 secret, but it is not
- -- so it should be hashed with SCRAM-SHA-256.
- CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- These may look like valid MD5 secrets, but they are not, so they
- -- should be hashed with SCRAM-SHA-256.
- -- trailing garbage at the end
- CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- -- invalid length
- CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+--- This looks like a valid SCRAM-SHA-256 secret, but it is not
+--- so it should be hashed with SCRAM-SHA-256.
+-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
+--- These may look like valid MD5 secrets, but they are not, so they
+--- should be hashed with SCRAM-SHA-256.
+--- trailing garbage at the end
+-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
+--- invalid length
+-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- Changing the SCRAM iteration count
 SET scram_iterations = 1024;
 CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount';
-@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+@@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
     ORDER BY rolname, rolpassword;
      rolname     |                rolpassword_masked                 
 -----------------+---------------------------------------------------
 - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70
 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb
-+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
-+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1
+ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2
  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
  regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
 - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023
- regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
- regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
- regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
-  regress_passwd9 | SCRAM-SHA-256$1024:<salt>$<storedkey>:<serverkey>
-(9 rows)
-+(5 rows)
- 
+ regress_passwd5 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+@@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
 -- An empty password is not allowed, in any form
 CREATE ROLE regress_passwd_empty PASSWORD '';
 NOTICE:  empty string is not a valid password, clearing password
@@ -1119,56 +1115,37 @@ index 924d6e001d..5966531db6 100644
 -(1 row)
 +(0 rows)
 
- -- Test with invalid stored and server keys.
- --
- -- The first is valid, to act as a control. The others have too long
- -- stored/server keys. They will be re-hashed.
- CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
- CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
-+ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+--- Test with invalid stored and server keys.
+---
+--- The first is valid, to act as a control. The others have too long
+--- stored/server keys. They will be re-hashed.
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- Check that the invalid secrets were re-hashed. A re-hashed secret
 -- should not contain the original salt.
 SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed
-     FROM pg_authid
-     WHERE rolname LIKE 'regress_passwd_sha_len%'
+@@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw
     ORDER BY rolname;
-         rolname         | is_rolpassword_rehashed 
--------------------------+-------------------------
+          rolname         | is_rolpassword_rehashed 
+ -------------------------+-------------------------
 - regress_passwd_sha_len0 | f
- regress_passwd_sha_len1 | t
- regress_passwd_sha_len2 | t
-(3 rows)
-+ rolname | is_rolpassword_rehashed 
-+---------+-------------------------
-+(0 rows)
- 
- DROP ROLE regress_passwd1;
- DROP ROLE regress_passwd2;
- DROP ROLE regress_passwd3;
- DROP ROLE regress_passwd4;
- DROP ROLE regress_passwd5;
-+ERROR:  role "regress_passwd5" does not exist
- DROP ROLE regress_passwd6;
-+ERROR:  role "regress_passwd6" does not exist
- DROP ROLE regress_passwd7;
-+ERROR:  role "regress_passwd7" does not exist
+ regress_passwd_sha_len0 | t
+  regress_passwd_sha_len1 | t
+  regress_passwd_sha_len2 | t
+ (3 rows)
+@@ -137,6 +125,7 @@ DROP ROLE regress_passwd7;
 DROP ROLE regress_passwd8;
-+ERROR:  role "regress_passwd8" does not exist
 DROP ROLE regress_passwd9;
 DROP ROLE regress_passwd_empty;
 +ERROR:  role "regress_passwd_empty" does not exist
 DROP ROLE regress_passwd_sha_len0;
-+ERROR:  role "regress_passwd_sha_len0" does not exist
 DROP ROLE regress_passwd_sha_len1;
-+ERROR:  role "regress_passwd_sha_len1" does not exist
 DROP ROLE regress_passwd_sha_len2;
-+ERROR:  role "regress_passwd_sha_len2" does not exist
- -- all entries should have been removed
- SELECT rolname, rolpassword
-     FROM pg_authid
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
 index 1296da0d57..f43fffa44c 100644
 --- a/src/test/regress/expected/privileges.out
@@ -3249,10 +3226,10 @@ index 1a6c61f49d..1c31ac6a53 100644
 -- Test generic object addressing/identification functions
 CREATE SCHEMA addr_nsp;
 diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql
-index bb82aa4aa2..7424c91b10 100644
+index bb82aa4aa2..dd8a05e24d 100644
 --- a/src/test/regress/sql/password.sql
 +++ b/src/test/regress/sql/password.sql
-@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok
+@@ -10,13 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok
 
 -- consistency of password entries
 SET password_encryption = 'md5';
@@ -3261,9 +3238,7 @@ index bb82aa4aa2..7424c91b10 100644
 -CREATE ROLE regress_passwd2;
 -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2';
 +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
-+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
-+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 SET password_encryption = 'scram-sha-256';
 -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
 -CREATE ROLE regress_passwd4 PASSWORD NULL;
@@ -3272,23 +3247,59 @@ index bb82aa4aa2..7424c91b10 100644
 
 -- check list of created entries
 --
-@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+@@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
 SET password_encryption = 'md5';
 
 -- encrypt with MD5
 -ALTER ROLE regress_passwd2 PASSWORD 'foo';
+--- already encrypted, use as they are
+-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
- -- already encrypted, use as they are
- ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
- ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
 
 SET password_encryption = 'scram-sha-256';
 -- create SCRAM secret
 -ALTER ROLE  regress_passwd4 PASSWORD 'foo';
 +ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 -- already encrypted with MD5, use as it is
- CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
 
+--- This looks like a valid SCRAM-SHA-256 secret, but it is not
+--- so it should be hashed with SCRAM-SHA-256.
+-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
+--- These may look like valid MD5 secrets, but they are not, so they
+--- should be hashed with SCRAM-SHA-256.
+--- trailing garbage at the end
+-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
+--- invalid length
+-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Changing the SCRAM iteration count
+ SET scram_iterations = 1024;
+@@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a';
+ ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4=';
+ SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty';
+ 
+--- Test with invalid stored and server keys.
+---
+--- The first is valid, to act as a control. The others have too long
+--- stored/server keys. They will be re-hashed.
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
+-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
+-- Neon does not support encrypted passwords, use unencrypted instead
+CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Check that the invalid secrets were re-hashed. A re-hashed secret
+ -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
 index 5880bc018d..27aa952b18 100644
 --- a/src/test/regress/sql/privileges.sql
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -246,47 +246,48 @@ fn try_spec_from_cli(
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    let spec;
-    let mut live_config_allowed = false;
-    match spec_json {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => {
-            info!("got spec from cli argument {}", json);
-            spec = Some(serde_json::from_str(json)?);
-        }
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                spec = Some(serde_json::from_reader(file)?);
-                live_config_allowed = true;
-            } else if let Some(id) = compute_id {
-                if let Some(cp_base) = control_plane_uri {
-                    live_config_allowed = true;
-                    spec = match get_spec_from_control_plane(cp_base, id) {
-                        Ok(s) => s,
-                        Err(e) => {
-                            error!("cannot get response from control plane: {}", e);
-                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
-                        }
-                    };
-                } else {
-                    panic!("must specify both --control-plane-uri and --compute-id or none");
-                }
-            } else {
-                panic!(
-                    "compute spec should be provided by one of the following ways: \
-                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
-                );
-            }
-        }
+    // First, try to get cluster spec from the cli argument
+    if let Some(spec_json) = spec_json {
+        info!("got spec from cli argument {}", spec_json);
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_str(spec_json)?),
+            live_config_allowed: false,
+        });
+    }
+
+    // Second, try to read it from the file if path is provided
+    if let Some(spec_path) = spec_path {
+        let file = File::open(Path::new(spec_path))?;
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_reader(file)?),
+            live_config_allowed: true,
+        });
+    }
+
+    let Some(compute_id) = compute_id else {
+        panic!(
+            "compute spec should be provided by one of the following ways: \
+                --spec OR --spec-path OR --control-plane-uri and --compute-id"
+        );
+    };
+    let Some(control_plane_uri) = control_plane_uri else {
+        panic!("must specify both --control-plane-uri and --compute-id or none");
    };

-    Ok(CliSpecParams {
-        spec,
-        live_config_allowed,
-    })
+    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+        Ok(spec) => Ok(CliSpecParams {
+            spec,
+            live_config_allowed: true,
+        }),
+        Err(e) => {
+            error!(
+                "cannot get response from control plane: {}\n\
+                neither spec nor confirmation that compute is in the Empty state was received",
+                e
+            );
+            Err(e)
+        }
+    }
 }

 struct CliSpecParams {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -537,12 +537,14 @@ components:
            properties:
              extname:
                type: string
-              versions:
-                type: array
+              version:
+                type: string
                items:
                  type: string
              n_databases:
                type: integer
+              owned_by_superuser:
+                type: integer

    SetRoleGrantsRequest:
      type: object
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,7 +1,6 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
-use std::collections::HashSet;

 use anyhow::Result;
 use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Connect to every database (see list_dbs above) and get the list of installed extensions.
 ///
 /// Same extension can be installed in multiple databases with different versions,
-/// we only keep the highest and lowest version across all databases.
+/// so we report a separate metric (number of databases where it is installed)
+/// for each extension version.
 pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
    conf.application_name("compute_ctl:get_installed_extensions");
    let mut client = conf.connect(NoTls)?;
-
    let databases: Vec<String> = list_dbs(&mut client)?;

-    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
    for db in databases.iter() {
        conf.dbname(db);
        let mut db_client = conf.connect(NoTls)?;
-        let extensions: Vec<(String, String)> = db_client
+        let extensions: Vec<(String, String, i32)> = db_client
            .query(
-                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
                &[],
            )?
            .iter()
-            .map(|row| (row.get("extname"), row.get("extversion")))
+            .map(|row| {
+                (
+                    row.get("extname"),
+                    row.get("extversion"),
+                    row.get("extowner"),
+                )
+            })
            .collect();

-        for (extname, v) in extensions.iter() {
+        for (extname, v, extowner) in extensions.iter() {
            let version = v.to_string();

-            // increment the number of databases where the version of extension is installed
-            INSTALLED_EXTENSIONS
-                .with_label_values(&[extname, &version])
-                .inc();
+            // check if the extension is owned by superuser
+            // 10 is the oid of superuser
+            let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };

            extensions_map
-                .entry(extname.to_string())
+                .entry((
+                    extname.to_string(),
+                    version.clone(),
+                    owned_by_superuser.to_string(),
+                ))
                .and_modify(|e| {
-                    e.versions.insert(version.clone());
                    // count the number of databases where the extension is installed
                    e.n_databases += 1;
                })
                .or_insert(InstalledExtension {
                    extname: extname.to_string(),
-                    versions: HashSet::from([version.clone()]),
+                    version: version.clone(),
                    n_databases: 1,
+                    owned_by_superuser: owned_by_superuser.to_string(),
                });
        }
    }

-    let res = InstalledExtensions {
-        extensions: extensions_map.into_values().collect(),
-    };
+    for (key, ext) in extensions_map.iter() {
+        let (extname, version, owned_by_superuser) = key;
+        let n_databases = ext.n_databases as u64;

-    Ok(res)
+        INSTALLED_EXTENSIONS
+            .with_label_values(&[extname, version, owned_by_superuser])
+            .set(n_databases);
+    }
+
+    Ok(InstalledExtensions {
+        extensions: extensions_map.into_values().collect(),
+    })
 }

 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "compute_installed_extensions",
        "Number of databases where the version of extension is installed",
-        &["extension_name", "version"]
+        &["extension_name", "version", "owned_by_superuser"]
    )
    .expect("failed to define a metric")
 });
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -274,6 +274,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
        "HOME",
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -810,7 +810,7 @@ impl Endpoint {
        }

        let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(30))
+            .timeout(Duration::from_secs(120))
            .build()
            .unwrap();
        let response = client
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -435,7 +435,7 @@ impl PageServerNode {
    ) -> anyhow::Result<()> {
        let config = Self::parse_config(settings)?;
        self.http_client
-            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
            .await?;

        Ok(())
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -9,8 +9,8 @@ use pageserver_api::{
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
-        TenantShardSplitResponse,
+        ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -116,9 +116,19 @@ enum Command {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// Set the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
+    /// Any previous tenant configs are overwritten.
+    SetTenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
+    /// provided JSON are unset from the tenant config and all fields with non-null values are set.
+    /// Unspecified fields are not changed.
+    PatchTenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TenantConfig { tenant_id, config } => {
+        Command::SetTenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::PatchTenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .patch_tenant_config(&TenantConfigPatchRequest {
                    tenant_id,
                    config: tenant_conf,
                })
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
            threshold,
        } => {
            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: TenantConfig {
                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,6 +1,5 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

-use std::collections::HashSet;
 use std::fmt::Display;

 use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct InstalledExtension {
    pub extname: String,
-    pub versions: HashSet<String>,
+    pub version: String,
    pub n_databases: u32, // Number of databases using this extension
+    pub owned_by_superuser: String,
 }

 #[derive(Clone, Debug, Default, Serialize)]
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
    pub scheduling: Option<ShardSchedulingPolicy>,
 }

-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
 pub struct AvailabilityZone(pub String);

 impl Display for AvailabilityZone {
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
 pub struct CompactKey(i128);

 /// The storage key size.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -17,7 +17,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 use utils::{
    completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
    }
 }

+#[derive(Debug, Default, Clone, Eq, PartialEq)]
+pub enum FieldPatch<T> {
+    Upsert(T),
+    Remove,
+    #[default]
+    Noop,
+}
+
+impl<T> FieldPatch<T> {
+    fn is_noop(&self) -> bool {
+        matches!(self, FieldPatch::Noop)
+    }
+
+    pub fn apply(self, target: &mut Option<T>) {
+        match self {
+            Self::Upsert(v) => *target = Some(v),
+            Self::Remove => *target = None,
+            Self::Noop => {}
+        }
+    }
+
+    pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
+        match self {
+            Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
+            Self::Remove => Ok(FieldPatch::<U>::Remove),
+            Self::Noop => Ok(FieldPatch::<U>::Noop),
+        }
+    }
+}
+
+impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        Option::deserialize(deserializer).map(|opt| match opt {
+            None => FieldPatch::Remove,
+            Some(val) => FieldPatch::Upsert(val),
+        })
+    }
+}
+
+impl<T: Serialize> Serialize for FieldPatch<T> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        match self {
+            FieldPatch::Upsert(val) => serializer.serialize_some(val),
+            FieldPatch::Remove => serializer.serialize_none(),
+            FieldPatch::Noop => unreachable!(),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
+#[serde(default)]
+pub struct TenantConfigPatch {
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_distance: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_target_size: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_threshold: FieldPatch<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_horizon: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub pitr_interval: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub walreceiver_connect_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lagging_wal_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub eviction_policy: FieldPatch<EvictionPolicy>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub min_resident_size_override: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub heatmap_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lazy_slru_download: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_creation_check_threshold: FieldPatch<u8>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length_for_ts: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_offloading: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
 }

+impl TenantConfig {
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch.compaction_period.apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch.gc_period.apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch.pitr_interval.apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .apply(&mut walreceiver_connect_timeout);
+        patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch.heatmap_period.apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch.lsn_lease_length.apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        }
+    }
+}
+
 /// The policy for the aux file storage.
 ///
 /// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
    }
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantConfigPatchRequest {
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
            );
        }
    }
+
+    #[test]
+    fn test_tenant_config_patch_request_serde() {
+        let patch_request = TenantConfigPatchRequest {
+            tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
+            config: TenantConfigPatch {
+                checkpoint_distance: FieldPatch::Upsert(42),
+                gc_horizon: FieldPatch::Remove,
+                compaction_threshold: FieldPatch::Noop,
+                ..TenantConfigPatch::default()
+            },
+        };
+
+        let json = serde_json::to_string(&patch_request).unwrap();
+
+        let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
+        assert_eq!(json, expected);
+
+        let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
+        assert_eq!(decoded.tenant_id, patch_request.tenant_id);
+        assert_eq!(decoded.config, patch_request.config);
+
+        // Now apply the patch to a config to demonstrate semantics
+
+        let base = TenantConfig {
+            checkpoint_distance: Some(28),
+            gc_horizon: Some(100),
+            compaction_target_size: Some(1024),
+            ..Default::default()
+        };
+
+        let expected = TenantConfig {
+            checkpoint_distance: Some(42),
+            gc_horizon: None,
+            ..base.clone()
+        };
+
+        let patched = base.apply_patch(decoded.config);
+
+        assert_eq!(patched, expected);
+    }
 }
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -5,6 +5,9 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-serde.workspace = true
 const_format.workspace = true
+serde.workspace = true
+postgres_ffi.workspace = true
+pq_proto.workspace = true
+tokio.workspace = true
 utils.workspace = true
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,10 +1,27 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;
+use pq_proto::SystemId;
+use serde::{Deserialize, Serialize};

 /// Public API types
 pub mod models;

+/// Consensus logical timestamp. Note: it is a part of sk control file.
+pub type Term = u64;
+pub const INVALID_TERM: Term = 0;
+
+/// Information about Postgres. Safekeeper gets it once and then verifies all
+/// further connections from computes match. Note: it is a part of sk control
+/// file.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ServerInfo {
+    /// Postgres server version
+    pub pg_version: u32,
+    pub system_id: SystemId,
+    pub wal_seg_size: u32,
+}
+
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,10 +1,23 @@
+//! Types used in safekeeper http API. Many of them are also reused internally.
+
+use postgres_ffi::TimestampTz;
 use serde::{Deserialize, Serialize};
+use std::net::SocketAddr;
+use tokio::time::Instant;

 use utils::{
-    id::{NodeId, TenantId, TimelineId},
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
+    pageserver_feedback::PageserverFeedback,
 };

+use crate::{ServerInfo, Term};
+
+#[derive(Debug, Serialize)]
+pub struct SafekeeperStatus {
+    pub id: NodeId,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
@@ -18,6 +31,161 @@ pub struct TimelineCreateRequest {
    pub local_start_lsn: Option<Lsn>,
 }

+/// Same as TermLsn, but serializes LSN using display serializer
+/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct TermSwitchApiEntry {
+    pub term: Term,
+    pub lsn: Lsn,
+}
+
+/// Augment AcceptorState with last_log_term for convenience
+#[derive(Debug, Serialize, Deserialize)]
+pub struct AcceptorStateStatus {
+    pub term: Term,
+    pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility
+    pub term_history: Vec<TermSwitchApiEntry>,
+}
+
+/// Things safekeeper should know about timeline state on peers.
+/// Used as both model and internally.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PeerInfo {
+    pub sk_id: NodeId,
+    pub term: Term,
+    /// Term of the last entry.
+    pub last_log_term: Term,
+    /// LSN of the last record.
+    pub flush_lsn: Lsn,
+    pub commit_lsn: Lsn,
+    /// Since which LSN safekeeper has WAL.
+    pub local_start_lsn: Lsn,
+    /// When info was received. Serde annotations are not very useful but make
+    /// the code compile -- we don't rely on this field externally.
+    #[serde(skip)]
+    #[serde(default = "Instant::now")]
+    pub ts: Instant,
+    pub pg_connstr: String,
+    pub http_connstr: String,
+}
+
+pub type FullTransactionId = u64;
+
+/// Hot standby feedback received from replica
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct HotStandbyFeedback {
+    pub ts: TimestampTz,
+    pub xmin: FullTransactionId,
+    pub catalog_xmin: FullTransactionId,
+}
+
+pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;
+
+impl HotStandbyFeedback {
+    pub fn empty() -> HotStandbyFeedback {
+        HotStandbyFeedback {
+            ts: 0,
+            xmin: 0,
+            catalog_xmin: 0,
+        }
+    }
+}
+
+/// Standby status update
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct StandbyReply {
+    pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
+    pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
+    pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
+    pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
+    pub reply_requested: bool,
+}
+
+impl StandbyReply {
+    pub fn empty() -> Self {
+        StandbyReply {
+            write_lsn: Lsn::INVALID,
+            flush_lsn: Lsn::INVALID,
+            apply_lsn: Lsn::INVALID,
+            reply_ts: 0,
+            reply_requested: false,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct StandbyFeedback {
+    pub reply: StandbyReply,
+    pub hs_feedback: HotStandbyFeedback,
+}
+
+impl StandbyFeedback {
+    pub fn empty() -> Self {
+        StandbyFeedback {
+            reply: StandbyReply::empty(),
+            hs_feedback: HotStandbyFeedback::empty(),
+        }
+    }
+}
+
+/// Receiver is either pageserver or regular standby, which have different
+/// feedbacks.
+/// Used as both model and internally.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum ReplicationFeedback {
+    Pageserver(PageserverFeedback),
+    Standby(StandbyFeedback),
+}
+
+/// Uniquely identifies a WAL service connection. Logged in spans for
+/// observability.
+pub type ConnectionId = u32;
+
+/// Serialize is used only for json'ing in API response. Also used internally.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalSenderState {
+    pub ttid: TenantTimelineId,
+    pub addr: SocketAddr,
+    pub conn_id: ConnectionId,
+    // postgres application_name
+    pub appname: Option<String>,
+    pub feedback: ReplicationFeedback,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalReceiverState {
+    /// None means it is recovery initiated by us (this safekeeper).
+    pub conn_id: Option<ConnectionId>,
+    pub status: WalReceiverStatus,
+}
+
+/// Walreceiver status. Currently only whether it passed voting stage and
+/// started receiving the stream, but it is easy to add more if needed.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum WalReceiverStatus {
+    Voting,
+    Streaming,
+}
+
+/// Info about timeline on safekeeper ready for reporting.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TimelineStatus {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub acceptor_state: AcceptorStateStatus,
+    pub pg_info: ServerInfo,
+    pub flush_lsn: Lsn,
+    pub timeline_start_lsn: Lsn,
+    pub local_start_lsn: Lsn,
+    pub commit_lsn: Lsn,
+    pub backup_lsn: Lsn,
+    pub peer_horizon_lsn: Lsn,
+    pub remote_consistent_lsn: Lsn,
+    pub peers: Vec<PeerInfo>,
+    pub walsenders: Vec<WalSenderState>,
+    pub walreceivers: Vec<WalReceiverState>,
+}
+
 fn lsn_invalid() -> Lsn {
    Lsn::INVALID
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -94,6 +94,8 @@ pub mod toml_edit_ext;

 pub mod circuit_breaker;

+pub mod try_rcu;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/try_rcu.rs
+++ b/libs/utils/src/try_rcu.rs
@@ -0,0 +1,77 @@
+//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
+
+pub trait ArcSwapExt<T> {
+    /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
+    fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>;
+}
+
+impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
+where
+    T: arc_swap::RefCnt,
+    S: arc_swap::strategy::CaS<T>,
+{
+    fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>,
+    {
+        fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
+        where
+            A: arc_swap::AsRaw<Base>,
+            B: arc_swap::AsRaw<Base>,
+        {
+            let a = a.as_raw();
+            let b = b.as_raw();
+            std::ptr::eq(a, b)
+        }
+
+        let mut cur = self.load();
+        loop {
+            let new = f(&cur)?.into();
+            let prev = self.compare_and_swap(&*cur, new);
+            let swapped = ptr_eq(&*cur, &*prev);
+            if swapped {
+                return Ok(arc_swap::Guard::into_inner(prev));
+            } else {
+                cur = prev;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arc_swap::ArcSwap;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_try_rcu_success() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
+
+        assert!(result.is_ok());
+        assert_eq!(**swap.load(), 43);
+    }
+
+    #[test]
+    fn test_try_rcu_error() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<i32, _> {
+            if **value == 42 {
+                Err("err")
+            } else {
+                Ok(**value + 1)
+            }
+        });
+
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "err");
+        assert_eq!(**swap.load(), 42);
+    }
+}
--- a/libs/wal_decoder/proto/interpreted_wal.proto
+++ b/libs/wal_decoder/proto/interpreted_wal.proto
@@ -37,7 +37,7 @@ message ValueMeta {
 }

 message CompactKey {
-  int64 high = 1;
-  int64 low = 2;
+  uint64 high = 1;
+  uint64 low = 2;
 }

--- a/libs/wal_decoder/src/wire_format.rs
+++ b/libs/wal_decoder/src/wire_format.rs
@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
 impl From<CompactKey> for proto::CompactKey {
    fn from(value: CompactKey) -> Self {
        proto::CompactKey {
-            high: (value.raw() >> 64) as i64,
-            low: value.raw() as i64,
+            high: (value.raw() >> 64) as u64,
+            low: value.raw() as u64,
        }
    }
 }
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
        (((value.high as i128) << 64) | (value.low as i128)).into()
    }
 }
+
+#[test]
+fn test_compact_key_with_large_relnode() {
+    use pageserver_api::key::Key;
+
+    let inputs = vec![
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x007FFFFF,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800000,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800001,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0xFFFFFFFF,
+            field3: 0xFFFFFFFF,
+            field4: 0xFFFFFFFF,
+            field5: 0x0,
+            field6: 0x0,
+        },
+    ];
+
+    for input in inputs {
+        assert!(input.is_valid_key_on_write_path());
+        let compact = input.to_compact();
+        let proto: proto::CompactKey = compact.into();
+        let from_proto: CompactKey = proto.into();
+
+        assert_eq!(
+            compact, from_proto,
+            "Round trip failed for key with relnode={:#x}",
+            input.field4
+        );
+    }
+}
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;

+    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-lib=static=pgport");
    println!("cargo:rustc-link-lib=static=pgcommon");
-    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-search={walproposer_lib_search_str}");

    // Rebuild crate when libwalproposer.a changes
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -270,12 +270,18 @@ impl Client {
        Ok(body)
    }

-    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
        Ok(())
    }

+    pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
+        self.request(Method::PATCH, &uri, req).await?;
+        Ok(())
+    }
+
    pub async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    println!("operating on timeline {}", timeline);

    mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
+        .set_tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
            config: TenantConfig::default(),
        })
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -767,7 +767,27 @@ paths:
  /v1/tenant/config:
    put:
      description: |
-        Update tenant's config.
+        Update tenant's config by setting it to the provided value
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantConfigRequest"
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TenantInfo"
+    patch:
+      description: |
+        Update tenant's config additively by patching the updated fields provided.
+        Null values unset the field and non-null values upsert it.

        Invalid fields in the tenant config will cause the request to be rejected with status 400.
      requestBody:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
+use pageserver_api::models::TenantConfigPatchRequest;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
@@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler(
    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+
+    let _ = tenant
+        .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
+        .expect("Closure returns Ok()");
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn patch_tenant_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    let updated = tenant
+        .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
+        .map_err(ApiError::BadRequest)?;
+
+    // This is a legacy API that only operates on attached tenants: the preferred
+    // API to use is the location_config/ endpoint, which lets the caller provide
+    // the full LocationConf.
+    let location_conf = LocationConf::attached_single(
+        updated,
+        tenant.get_generation(),
+        &ShardParameters::default(),
+    );
+
+    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+        .await
+        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

    json_response(StatusCode::OK, ())
 }
@@ -2040,13 +2081,20 @@ async fn timeline_compact_handler(
        .as_ref()
        .map(|r| r.sub_compaction)
        .unwrap_or(false);
+    let sub_compaction_max_job_size_mb = compact_request
+        .as_ref()
+        .and_then(|r| r.sub_compaction_max_job_size_mb);
+
    let options = CompactOptions {
-        compact_range: compact_request
+        compact_key_range: compact_request
            .as_ref()
-            .and_then(|r| r.compact_range.clone()),
-        compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
+            .and_then(|r| r.compact_key_range.clone()),
+        compact_lsn_range: compact_request
+            .as_ref()
+            .and_then(|r| r.compact_lsn_range.clone()),
        flags,
        sub_compaction,
+        sub_compaction_max_job_size_mb,
    };

    let scheduled = compact_request
@@ -3288,6 +3336,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
+        .patch("/v1/tenant/config", |r| {
+            api_handler(r, patch_tenant_config_handler)
+        })
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
-use tracing::warn;
 use utils::id::TimelineId;

 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -1225,32 +1224,58 @@ pub(crate) mod virtual_file_io_engine {

 pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
 pub(crate) struct SmgrOpTimerInner {
-    global_latency_histo: Histogram,
+    global_execution_latency_histo: Histogram,
+    per_timeline_execution_latency_histo: Option<Histogram>,

-    // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<Histogram>,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,

    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,

-    start: Instant,
-    throttled: Duration,
-    op: SmgrQueryType,
+    timings: SmgrOpTimerState,
+}
+
+#[derive(Debug)]
+enum SmgrOpTimerState {
+    Received {
+        received_at: Instant,
+    },
+    ThrottleDoneExecutionStarting {
+        received_at: Instant,
+        throttle_started_at: Instant,
+        started_execution_at: Instant,
+    },
 }

 pub(crate) struct SmgrOpFlushInProgress {
-    base: Instant,
+    flush_started_at: Instant,
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
 }

 impl SmgrOpTimer {
-    pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
-        let Some(throttle) = throttle else {
-            return;
-        };
+    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
        let inner = self.0.as_mut().expect("other public methods consume self");
-        inner.throttled += *throttle;
+        match (&mut inner.timings, throttle) {
+            (SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
+                ThrottleResult::NotThrottled { start } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at,
+                        throttle_started_at: *start,
+                        started_execution_at: *start,
+                    };
+                }
+                ThrottleResult::Throttled { start, end } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *start,
+                        throttle_started_at: *start,
+                        started_execution_at: *end,
+                    };
+                }
+            },
+            (x, _) => panic!("called in unexpected state: {x:?}"),
+        }
    }

    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
@@ -1263,7 +1288,7 @@ impl SmgrOpTimer {
            ..
        } = inner;
        SmgrOpFlushInProgress {
-            base: flush_start,
+            flush_started_at: flush_start,
            global_micros: global_flush_in_progress_micros,
            per_timeline_micros: per_timeline_flush_in_progress_micros,
        }
@@ -1274,32 +1299,42 @@ impl SmgrOpTimer {
        let inner = self.0.take()?;

        let now = Instant::now();
-        let elapsed = now - inner.start;

-        let elapsed = match elapsed.checked_sub(inner.throttled) {
-            Some(elapsed) => elapsed,
-            None => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[inner.op];
-                rate_limit.call(|| {
-                    warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
-                });
-                elapsed // un-throttled time, more info than just saturating to 0
+        let batch;
+        let execution;
+        let throttle;
+        match inner.timings {
+            SmgrOpTimerState::Received { received_at } => {
+                batch = (now - received_at).as_secs_f64();
+                // TODO: use label for dropped requests.
+                // This is quite rare in practice, only during tenant/pageservers shutdown.
+                throttle = Duration::ZERO;
+                execution = Duration::ZERO.as_secs_f64();
            }
-        };
+            SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                received_at,
+                throttle_started_at,
+                started_execution_at,
+            } => {
+                batch = (throttle_started_at - received_at).as_secs_f64();
+                throttle = started_execution_at - throttle_started_at;
+                execution = (now - started_execution_at).as_secs_f64();
+            }
+        }

-        let elapsed = elapsed.as_secs_f64();
+        // update time spent in batching
+        inner.global_batch_wait_time.observe(batch);
+        inner.per_timeline_batch_wait_time.observe(batch);

-        inner.global_latency_histo.observe(elapsed);
-        if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(elapsed);
+        // time spent in throttle metric is updated by throttle impl
+        let _ = throttle;
+
+        // update metrics for execution latency
+        inner.global_execution_latency_histo.observe(execution);
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution);
        }

        Some((now, inner))
@@ -1325,12 +1360,12 @@ impl SmgrOpFlushInProgress {
        // Last call is tracked in `now`.
        let mut observe_guard = scopeguard::guard(
            || {
-                let elapsed = now - self.base;
+                let elapsed = now - self.flush_started_at;
                self.global_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
                self.per_timeline_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.base = now;
+                self.flush_started_at = now;
            },
            |mut observe| {
                observe();
@@ -1377,6 +1412,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_batch_size: Histogram,
    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,
 }

 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1399,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
    .expect("failed to define a metric")
 });

+// Alias so all histograms recording per-timeline smgr timings use the same buckets.
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
+        "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
-        CRITICAL_OP_BUCKETS.into(),
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -1462,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds_global",
-        "Time spent on smgr query handling, aggregated by query type.",
+        "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
        &["smgr_query_type"],
        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
    )
@@ -1559,6 +1599,25 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy
    .expect("failed to define a metric")
 });

+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds",
+        "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
+        "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
+        SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
+    )
+    .expect("failed to define a metric")
+});
+
 impl SmgrQueryTimePerTimeline {
    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1599,6 +1658,11 @@ impl SmgrQueryTimePerTimeline {
            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
            .unwrap();

+        let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
+        let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
        let global_flush_in_progress_micros =
            PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
        let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -1614,9 +1678,11 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_batch_size,
            global_flush_in_progress_micros,
            per_timeline_flush_in_progress_micros,
+            global_batch_wait_time,
+            per_timeline_batch_wait_time,
        }
    }
-    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
+    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
        self.global_started[op as usize].inc();

        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1627,15 +1693,15 @@ impl SmgrQueryTimePerTimeline {
        };

        SmgrOpTimer(Some(SmgrOpTimerInner {
-            global_latency_histo: self.global_latency[op as usize].clone(),
-            per_timeline_latency_histo,
-            start: started_at,
-            op,
-            throttled: Duration::ZERO,
+            global_execution_latency_histo: self.global_latency[op as usize].clone(),
+            per_timeline_execution_latency_histo: per_timeline_latency_histo,
+            timings: SmgrOpTimerState::Received { received_at },
            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
            per_timeline_flush_in_progress_micros: self
                .per_timeline_flush_in_progress_micros
                .clone(),
+            global_batch_wait_time: self.global_batch_wait_time.clone(),
+            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
        }))
    }

@@ -2889,6 +2955,11 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
        ]);
+        let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
    }
 }

@@ -2919,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
 use crate::tenant::Timeline;

 /// Maintain a per timeline gauge in addition to the global gauge.
@@ -3773,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
        &CIRCUIT_BREAKERS_BROKEN,
        &CIRCUIT_BREAKERS_UNBROKEN,
+        &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
    ]
    .into_iter()
    .for_each(|c| {
@@ -3820,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &WAL_REDO_BYTES_HISTOGRAM,
        &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
        &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
+        &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
    ]
    .into_iter()
    .for_each(|h| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -575,7 +575,10 @@ enum BatchedFeMessage {
 }

 impl BatchedFeMessage {
-    async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
+    async fn throttle_and_record_start_processing(
+        &mut self,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
        let (shard, tokens, timers) = match self {
            BatchedFeMessage::Exists { shard, timer, .. }
            | BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
            }
        };
        for timer in timers {
-            timer.deduct_throttle(&throttled);
+            timer.observe_throttle_done_execution_starting(&throttled);
        }
        Ok(())
    }
@@ -1230,7 +1233,7 @@ impl PageServerHandler {
                }
            };

-            if let Err(cancelled) = msg.throttle(&self.cancel).await {
+            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
                break cancelled;
            }

@@ -1397,7 +1400,9 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    batch.throttle(&self.cancel).await?;
+                    batch
+                        .throttle_and_record_start_processing(&self.cancel)
+                        .await?;
                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
                        .await?;
                }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -44,6 +44,7 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::GcCompactJob;
 use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
@@ -68,6 +69,7 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::try_rcu::ArcSwapExt;
 use utils::zstd::create_zst_tarball;
 use utils::zstd::extract_zst_tarball;

@@ -3016,8 +3018,15 @@ impl Tenant {
                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
                        } else if next_scheduled_compaction_task.options.sub_compaction {
                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-                            let jobs = timeline
-                                .gc_compaction_split_jobs(next_scheduled_compaction_task.options)
+                            let jobs: Vec<GcCompactJob> = timeline
+                                .gc_compaction_split_jobs(
+                                    GcCompactJob::from_compact_options(
+                                        next_scheduled_compaction_task.options.clone(),
+                                    ),
+                                    next_scheduled_compaction_task
+                                        .options
+                                        .sub_compaction_max_job_size_mb,
+                                )
                                .await
                                .map_err(CompactionError::Other)?;
                            if jobs.is_empty() {
@@ -3028,9 +3037,23 @@ impl Tenant {
                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
                                for (idx, job) in jobs.into_iter().enumerate() {
+                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
+                                    // until we do further refactors to allow directly call `compact_with_gc`.
+                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                                    if job.dry_run {
+                                        flags |= CompactFlags::DryRun;
+                                    }
+                                    let options = CompactOptions {
+                                        flags,
+                                        sub_compaction: false,
+                                        compact_key_range: Some(job.compact_key_range.into()),
+                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
+                                        sub_compaction_max_job_size_mb: None,
+                                    };
                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
                                        ScheduledCompactionTask {
-                                            options: job,
+                                            options,
                                            // The last job in the queue sends the signal and releases the gc guard
                                            result_tx: next_scheduled_compaction_task
                                                .result_tx
@@ -3041,7 +3064,7 @@ impl Tenant {
                                        }
                                    } else {
                                        ScheduledCompactionTask {
-                                            options: job,
+                                            options,
                                            result_tx: None,
                                            gc_block: None,
                                        }
@@ -3921,25 +3944,28 @@ impl Tenant {
        }
    }

-    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
+    pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
+        &self,
+        update: F,
+    ) -> anyhow::Result<TenantConfOpt> {
        // Use read-copy-update in order to avoid overwriting the location config
        // state if this races with [`Tenant::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.

-        self.tenant_conf.rcu(|inner| {
-            Arc::new(AttachedTenantConf {
-                tenant_conf: new_tenant_conf.clone(),
-                location: inner.location,
-                // Attached location is not changed, no need to update lsn lease deadline.
-                lsn_lease_deadline: inner.lsn_lease_deadline,
-            })
-        });
+        self.tenant_conf
+            .try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
+                Ok(Arc::new(AttachedTenantConf {
+                    tenant_conf: update(attached_conf.tenant_conf.clone())?,
+                    location: attached_conf.location,
+                    lsn_lease_deadline: attached_conf.lsn_lease_deadline,
+                }))
+            })?;

-        let updated = self.tenant_conf.load().clone();
+        let updated = self.tenant_conf.load();

-        self.tenant_conf_updated(&new_tenant_conf);
+        self.tenant_conf_updated(&updated.tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
@@ -3947,6 +3973,8 @@ impl Tenant {
        for timeline in timelines {
            timeline.tenant_conf_updated(&updated);
        }
+
+        Ok(updated.tenant_conf.clone())
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
@@ -5736,6 +5764,8 @@ mod tests {
    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    #[cfg(feature = "testing")]
+    use timeline::CompactLsnRange;
+    #[cfg(feature = "testing")]
    use timeline::GcInfo;

    static TEST_KEY: Lazy<Key> =
@@ -9327,7 +9357,6 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: dryrun_flags,
-                    compact_range: None,
                    ..Default::default()
                },
                &ctx,
@@ -9576,7 +9605,6 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: dryrun_flags,
-                    compact_range: None,
                    ..Default::default()
                },
                &ctx,
@@ -9606,6 +9634,8 @@ mod tests {
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
+        use timeline::CompactLsnRange;
+
        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
        let (tenant, ctx) = harness.load().await;

@@ -9798,6 +9828,22 @@ mod tests {

        verify_result().await;

+        // Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files.
+        // Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18).
+        branch_tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+        verify_result().await;
+
        Ok(())
    }

@@ -10086,7 +10132,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_key_range: Some((get_key(0)..get_key(2)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10133,7 +10179,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(2)..get_key(4)).into()),
+                    compact_key_range: Some((get_key(2)..get_key(4)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10185,7 +10231,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(4)..get_key(9)).into()),
+                    compact_key_range: Some((get_key(4)..get_key(9)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10236,7 +10282,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(9)..get_key(10)).into()),
+                    compact_key_range: Some((get_key(9)..get_key(10)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10292,7 +10338,7 @@ mod tests {
                &cancel,
                CompactOptions {
                    flags: EnumSet::new(),
-                    compact_range: Some((get_key(0)..get_key(10)).into()),
+                    compact_key_range: Some((get_key(0)..get_key(10)).into()),
                    ..Default::default()
                },
                &ctx,
@@ -10321,7 +10367,6 @@ mod tests {
                },
            ],
        );
-
        Ok(())
    }

@@ -10374,4 +10419,602 @@ mod tests {

        Ok(())
    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![(
+            get_key(1),
+            Lsn(0x20),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+        )];
+        let delta4 = vec![(
+            get_key(1),
+            Lsn(0x28),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+        )];
+        let delta2 = vec![
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x38),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    // delta1/2/4 only contain a single key but multiple updates
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
+                ],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_20 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_10 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
+            for idx in 0..10 {
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_20[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_10[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // Delta layer below the specified above_lsn not compacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x28),
+                    is_delta: true,
+                },
+                // Delta layer compacted above the LSN
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x28)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // compact again
+        tline
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The compacted image layer (full key range)
+                PersistentLayerKey {
+                    key_range: Key::MIN..Key::MAX,
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // All other data in the delta layer
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        Ok(())
+    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![(
+            get_key(1),
+            Lsn(0x20),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+        )];
+        let delta4 = vec![(
+            get_key(1),
+            Lsn(0x28),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+        )];
+        let delta2 = vec![
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x38),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    // delta1/2/4 only contain a single key but multiple updates
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
+                ],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_20 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_10 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
+            for idx in 0..10 {
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_20[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_10[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // According the selection logic, we select all layers with start key <= 0x28, so we would merge the layer 0x20-0x28 and
+                // the layer 0x28-0x30 into one.
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x30),
+                    is_delta: true,
+                },
+                // Above the upper bound and untouched
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // This layer is untouched
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(3)..get_key(8)).into()),
+                    compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // Not in the compaction key range, uncompacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x30),
+                    is_delta: true,
+                },
+                // Not in the compaction key range, uncompacted but need rewrite because the delta layer overlaps with the range
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer
+                // horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction
+                // becomes 0x50.
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // compact again
+        tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: Some((get_key(0)..get_key(5)).into()),
+                    compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The original image layer, not compacted
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // The range gets compacted
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x50),
+                    is_delta: true,
+                },
+                // Not touched during this iteration of compaction
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // final full compaction
+        tline
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                // The compacted image layer (full key range)
+                PersistentLayerKey {
+                    key_range: Key::MIN..Key::MAX,
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                // All other data in the delta layer
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
                .or(global_conf.wal_receiver_protocol_override),
        }
    }
+
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch
+            .checkpoint_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch
+            .compaction_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch
+            .gc_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch
+            .pitr_interval
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut walreceiver_connect_timeout);
+        patch
+            .lagging_wal_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch
+            .heatmap_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .lsn_lease_length
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Ok(Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        })
+    }
 }

 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -58,6 +58,11 @@ pub struct Stats {
    pub sum_throttled_usecs: u64,
 }

+pub enum ThrottleResult {
+    NotThrottled { start: Instant },
+    Throttled { start: Instant, end: Instant },
+}
+
 impl<M> Throttle<M>
 where
    M: Metric,
@@ -122,15 +127,15 @@ where
        self.inner.load().rate_limiter.steady_rps()
    }

-    pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
+    pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

-        if !inner.enabled {
-            return None;
-        }
-
        let start = std::time::Instant::now();

+        if !inner.enabled {
+            return ThrottleResult::NotThrottled { start };
+        }
+
        self.metric.accounting_start();
        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
            let observation = Observation { wait_time };
            self.metric.observe_throttling(&observation);
-            Some(wait_time)
+            ThrottleResult::Throttled { start, end: now }
        } else {
-            None
+            ThrottleResult::NotThrottled { start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -780,46 +780,90 @@ pub(crate) enum CompactFlags {
 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
 pub(crate) struct CompactRequest {
-    pub compact_range: Option<CompactRange>,
-    pub compact_below_lsn: Option<Lsn>,
+    pub compact_key_range: Option<CompactKeyRange>,
+    pub compact_lsn_range: Option<CompactLsnRange>,
    /// Whether the compaction job should be scheduled.
    #[serde(default)]
    pub scheduled: bool,
    /// Whether the compaction job should be split across key ranges.
    #[serde(default)]
    pub sub_compaction: bool,
+    /// Max job size for each subcompaction job.
+    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
-pub(crate) struct CompactRange {
+pub(crate) struct CompactLsnRange {
+    pub start: Lsn,
+    pub end: Lsn,
+}
+
+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactKeyRange {
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub start: Key,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub end: Key,
 }

-impl From<Range<Key>> for CompactRange {
-    fn from(range: Range<Key>) -> Self {
-        CompactRange {
+impl From<Range<Lsn>> for CompactLsnRange {
+    fn from(range: Range<Lsn>) -> Self {
+        Self {
            start: range.start,
            end: range.end,
        }
    }
 }

+impl From<Range<Key>> for CompactKeyRange {
+    fn from(range: Range<Key>) -> Self {
+        Self {
+            start: range.start,
+            end: range.end,
+        }
+    }
+}
+
+impl From<CompactLsnRange> for Range<Lsn> {
+    fn from(range: CompactLsnRange) -> Self {
+        range.start..range.end
+    }
+}
+
+impl From<CompactKeyRange> for Range<Key> {
+    fn from(range: CompactKeyRange) -> Self {
+        range.start..range.end
+    }
+}
+
+impl CompactLsnRange {
+    #[cfg(test)]
+    #[cfg(feature = "testing")]
+    pub fn above(lsn: Lsn) -> Self {
+        Self {
+            start: lsn,
+            end: Lsn::MAX,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
    /// If set, the compaction will only compact the key range specified by this option.
-    /// This option is only used by GC compaction.
-    pub compact_range: Option<CompactRange>,
-    /// If set, the compaction will only compact the LSN below this value.
-    /// This option is only used by GC compaction.
-    pub compact_below_lsn: Option<Lsn>,
+    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
+    pub compact_key_range: Option<CompactKeyRange>,
+    /// If set, the compaction will only compact the LSN within this value.
+    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
+    pub compact_lsn_range: Option<CompactLsnRange>,
    /// Enable sub-compaction (split compaction job across key ranges).
    /// This option is only used by GC compaction.
    pub sub_compaction: bool,
+    /// Set job size for the GC compaction.
+    /// This option is only used by GC compaction.
+    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

 impl std::fmt::Debug for Timeline {
@@ -1641,9 +1685,10 @@ impl Timeline {
            cancel,
            CompactOptions {
                flags,
-                compact_range: None,
-                compact_below_lsn: None,
+                compact_key_range: None,
+                compact_lsn_range: None,
                sub_compaction: false,
+                sub_compaction_max_job_size_mb: None,
            },
            ctx,
        )
@@ -4019,8 +4064,11 @@ impl Timeline {
            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
            // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
            // and hence before the compaction task starts.
+            // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
+            // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
+            // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
            return Err(CompactionError::Other(anyhow!(
-                "repartition() called concurrently, this should not happen"
+                "repartition() called concurrently, this is rare and a retry should be fine"
            )));
        };
        let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,8 +10,8 @@ use std::sync::Arc;

 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
-    ImageLayerCreationMode, RecordedDuration, Timeline,
+    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
 };

 use anyhow::{anyhow, bail, Context};
@@ -64,6 +64,9 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;

 /// A scheduled compaction task.
 pub(crate) struct ScheduledCompactionTask {
+    /// It's unfortunate that we need to store a compact options struct here because the only outer
+    /// API we can call here is `compact_with_options` which does a few setup calls before starting the
+    /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
    pub options: CompactOptions,
    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
@@ -71,16 +74,57 @@ pub(crate) struct ScheduledCompactionTask {
    pub gc_block: Option<gc_block::Guard>,
 }

+/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
+/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets
+/// called.
+#[derive(Debug, Clone)]
+pub(crate) struct GcCompactJob {
+    pub dry_run: bool,
+    /// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range
+    /// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary.
+    pub compact_key_range: Range<Key>,
+    /// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be
+    /// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range
+    /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`].
+    /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here.
+    pub compact_lsn_range: Range<Lsn>,
+}
+
+impl GcCompactJob {
+    pub fn from_compact_options(options: CompactOptions) -> Self {
+        GcCompactJob {
+            dry_run: options.flags.contains(CompactFlags::DryRun),
+            compact_key_range: options
+                .compact_key_range
+                .map(|x| x.into())
+                .unwrap_or(Key::MIN..Key::MAX),
+            compact_lsn_range: options
+                .compact_lsn_range
+                .map(|x| x.into())
+                .unwrap_or(Lsn::INVALID..Lsn::MAX),
+        }
+    }
+}
+
+/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called
+/// and contains the exact layers we want to compact.
 pub struct GcCompactionJobDescription {
    /// All layers to read in the compaction job
    selected_layers: Vec<Layer>,
-    /// GC cutoff of the job
+    /// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to
+    /// keep all deltas <= this LSN or generate an image == this LSN.
    gc_cutoff: Lsn,
-    /// LSNs to retain for the job
+    /// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or
+    /// generate an image == this LSN.
    retain_lsns_below_horizon: Vec<Lsn>,
-    /// Maximum layer LSN processed in this compaction
+    /// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data
+    /// \>= this LSN will be kept and will not be rewritten.
    max_layer_lsn: Lsn,
-    /// Only compact layers overlapping with this range
+    /// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive.
+    /// All access below (strict lower than `<`) this LSN will be routed through the normal read path instead of
+    /// k-merge within gc-compaction.
+    min_layer_lsn: Lsn,
+    /// Only compact layers overlapping with this range.
    compaction_key_range: Range<Key>,
    /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
    /// This field is here solely for debugging. The field will not be read once the compaction
@@ -299,7 +343,7 @@ impl Timeline {
            )));
        }

-        if options.compact_range.is_some() {
+        if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() {
            // maybe useful in the future? could implement this at some point
            return Err(CompactionError::Other(anyhow!(
                "compaction range is not supported for legacy compaction for now"
@@ -1754,32 +1798,35 @@ impl Timeline {
        Ok(())
    }

-    /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
-    /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
-    /// ad-hoc information about gc compaction itself.
+    /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
+    /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
+    /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts
+    /// like a full compaction of the specified keyspace.
    pub(crate) async fn gc_compaction_split_jobs(
        self: &Arc<Self>,
-        options: CompactOptions,
-    ) -> anyhow::Result<Vec<CompactOptions>> {
-        if !options.sub_compaction {
-            return Ok(vec![options]);
-        }
-        let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
-            start: Key::MIN,
-            end: Key::MAX,
-        });
-        let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
-            compact_below_lsn
+        job: GcCompactJob,
+        sub_compaction_max_job_size_mb: Option<u64>,
+    ) -> anyhow::Result<Vec<GcCompactJob>> {
+        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
+            job.compact_lsn_range.end
        } else {
            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
        };
+
+        // Split compaction job to about 4GB each
+        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
+        let sub_compaction_max_job_size_mb =
+            sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
+
        let mut compact_jobs = Vec::new();
        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
-        let Ok(partition) = self.partitioning.try_lock() else {
-            bail!("failed to acquire partition lock");
+        let ((dense_ks, sparse_ks), _) = {
+            let Ok(partition) = self.partitioning.try_lock() else {
+                bail!("failed to acquire partition lock");
+            };
+            partition.clone()
        };
-        let ((dense_ks, sparse_ks), _) = &*partition;
        // Truncate the key range to be within user specified compaction range.
        fn truncate_to(
            source_start: &Key,
@@ -1808,8 +1855,8 @@ impl Timeline {
            let Some((start, end)) = truncate_to(
                &range.start,
                &range.end,
-                &compact_range.start,
-                &compact_range.end,
+                &job.compact_key_range.start,
+                &job.compact_key_range.end,
            ) else {
                continue;
            };
@@ -1819,8 +1866,6 @@ impl Timeline {
        let guard = self.layers.read().await;
        let layer_map = guard.layer_map()?;
        let mut current_start = None;
-        // Split compaction job to about 2GB each
-        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
        let ranges_num = split_key_ranges.len();
        for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
            if current_start.is_none() {
@@ -1833,8 +1878,7 @@ impl Timeline {
            }
            let res = layer_map.range_search(start..end, compact_below_lsn);
            let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
-            if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
-                let mut compact_options = options.clone();
+            if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
                // Try to extend the compaction range so that we include at least one full layer file.
                let extended_end = res
                    .found
@@ -1852,10 +1896,11 @@ impl Timeline {
                    "splitting compaction job: {}..{}, estimated_size={}",
                    start, end, total_size
                );
-                compact_options.compact_range = Some(CompactRange { start, end });
-                compact_options.compact_below_lsn = Some(compact_below_lsn);
-                compact_options.sub_compaction = false;
-                compact_jobs.push(compact_options);
+                compact_jobs.push(GcCompactJob {
+                    dry_run: job.dry_run,
+                    compact_key_range: start..end,
+                    compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
+                });
                current_start = Some(end);
            }
        }
@@ -1877,7 +1922,7 @@ impl Timeline {
    /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
    /// part of the range.
    ///
-    /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
+    /// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with
    /// the LSN. Otherwise, it will use the gc cutoff by default.
    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
@@ -1885,9 +1930,13 @@ impl Timeline {
        options: CompactOptions,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if options.sub_compaction {
+        let sub_compaction = options.sub_compaction;
+        let job = GcCompactJob::from_compact_options(options.clone());
+        if sub_compaction {
            info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-            let jobs = self.gc_compaction_split_jobs(options).await?;
+            let jobs = self
+                .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb)
+                .await?;
            let jobs_len = jobs.len();
            for (idx, job) in jobs.into_iter().enumerate() {
                info!(
@@ -1902,19 +1951,15 @@ impl Timeline {
            }
            return Ok(());
        }
-        self.compact_with_gc_inner(cancel, options, ctx).await
+        self.compact_with_gc_inner(cancel, job, ctx).await
    }

    async fn compact_with_gc_inner(
        self: &Arc<Self>,
        cancel: &CancellationToken,
-        options: CompactOptions,
+        job: GcCompactJob,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        assert!(
-            !options.sub_compaction,
-            "sub-compaction should be handled by the outer function"
-        );
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1934,19 +1979,11 @@ impl Timeline {
        )
        .await?;

-        let flags = options.flags;
-        let compaction_key_range = options
-            .compact_range
-            .map(|range| range.start..range.end)
-            .unwrap_or_else(|| Key::MIN..Key::MAX);
+        let dry_run = job.dry_run;
+        let compact_key_range = job.compact_key_range;
+        let compact_lsn_range = job.compact_lsn_range;

-        let dry_run = flags.contains(CompactFlags::DryRun);
-
-        if compaction_key_range == (Key::MIN..Key::MAX) {
-            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
-        } else {
-            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
-        }
+        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);

        scopeguard::defer! {
            info!("done enhanced gc bottom-most compaction");
@@ -1970,11 +2007,15 @@ impl Timeline {
                // to get the truth data.
                let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
-                // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
+                // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
                // the real cutoff.
-                let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
+                let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX {
+                    real_gc_cutoff
+                } else {
+                    compact_lsn_range.end
+                };
                if gc_cutoff > real_gc_cutoff {
-                    warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
+                    warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
                    gc_cutoff = real_gc_cutoff;
                }
                gc_cutoff
@@ -1991,7 +2032,7 @@ impl Timeline {
            }
            let mut selected_layers: Vec<Layer> = Vec::new();
            drop(gc_info);
-            // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
+            // Firstly, pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
            let Some(max_layer_lsn) = layers
                .iter_historic_layers()
                .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
@@ -2001,27 +2042,45 @@ impl Timeline {
                info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
                return Ok(());
            };
+            // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
+            // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
+            // it is a branch.
+            let Some(min_layer_lsn) = layers
+                .iter_historic_layers()
+                .filter(|desc| {
+                    if compact_lsn_range.start == Lsn::INVALID {
+                        true // select all layers below if start == Lsn(0)
+                    } else {
+                        desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn
+                    }
+                })
+                .map(|desc| desc.get_lsn_range().start)
+                .min()
+            else {
+                info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end);
+                return Ok(());
+            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
            let mut rewrite_layers = Vec::new();
            for desc in layers.iter_historic_layers() {
                if desc.get_lsn_range().end <= max_layer_lsn
-                    && overlaps_with(&desc.get_key_range(), &compaction_key_range)
+                    && desc.get_lsn_range().start >= min_layer_lsn
+                    && overlaps_with(&desc.get_key_range(), &compact_key_range)
                {
                    // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
                    // even if it might contain extra keys
                    selected_layers.push(guard.get_from_desc(&desc));
                    // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
                    // to overlap image layers)
-                    if desc.is_delta()
-                        && !fully_contains(&compaction_key_range, &desc.get_key_range())
+                    if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range())
                    {
                        rewrite_layers.push(desc);
                    }
                }
            }
            if selected_layers.is_empty() {
-                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
+                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end);
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
@@ -2029,13 +2088,20 @@ impl Timeline {
                selected_layers,
                gc_cutoff,
                retain_lsns_below_horizon,
+                min_layer_lsn,
                max_layer_lsn,
-                compaction_key_range,
+                compaction_key_range: compact_key_range,
                rewrite_layers,
            }
        };
-        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
-            Lsn(self.ancestor_lsn.0 + 1)
+        let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID {
+            // If we only compact above some LSN, we should get the history from the current branch below the specified LSN.
+            // We use job_desc.min_layer_lsn as if it's the lowest branch point.
+            (true, job_desc.min_layer_lsn)
+        } else if self.ancestor_timeline.is_some() {
+            // In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the
+            // LSN ranges all the way to the ancestor timeline.
+            (true, self.ancestor_lsn)
        } else {
            let res = job_desc
                .retain_lsns_below_horizon
@@ -2053,17 +2119,19 @@ impl Timeline {
                        .unwrap_or(job_desc.gc_cutoff)
                );
            }
-            res
+            (false, res)
        };
        info!(
-            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
+            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
            job_desc.selected_layers.len(),
            job_desc.rewrite_layers.len(),
            job_desc.max_layer_lsn,
+            job_desc.min_layer_lsn,
            job_desc.gc_cutoff,
            lowest_retain_lsn,
            job_desc.compaction_key_range.start,
-            job_desc.compaction_key_range.end
+            job_desc.compaction_key_range.end,
+            has_data_below,
        );

        for layer in &job_desc.selected_layers {
@@ -2107,10 +2175,22 @@ impl Timeline {
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
+        let mut total_downloaded_size = 0;
+        let mut total_layer_size = 0;
        for layer in &job_desc.selected_layers {
+            if layer.needs_download().await?.is_some() {
+                total_downloaded_size += layer.layer_desc().file_size;
+            }
+            total_layer_size += layer.layer_desc().file_size;
            let resident_layer = layer.download_and_keep_resident().await?;
            downloaded_layers.push(resident_layer);
        }
+        info!(
+            "finish downloading layers, downloaded={}, total={}, ratio={:.2}",
+            total_downloaded_size,
+            total_layer_size,
+            total_downloaded_size as f64 / total_layer_size as f64
+        );
        for resident_layer in &downloaded_layers {
            if resident_layer.layer_desc().is_delta() {
                let layer = resident_layer.get_as_delta(ctx).await?;
@@ -2133,7 +2213,7 @@ impl Timeline {

        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
        // when some condition meet.
-        let mut image_layer_writer = if self.ancestor_timeline.is_none() {
+        let mut image_layer_writer = if !has_data_below {
            Some(
                SplitImageLayerWriter::new(
                    self.conf,
@@ -2166,7 +2246,11 @@ impl Timeline {
        }
        let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();

-        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
+        /// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`).
+        /// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set.
+        /// In those cases, we need to pull up data from below the LSN range we're compaction.
+        ///
+        /// This function unifies the cases so that later code doesn't have to think about it.
        ///
        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
        /// is needed for reconstruction. This should be fixed in the future.
@@ -2174,17 +2258,19 @@ impl Timeline {
        /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
        /// images.
        async fn get_ancestor_image(
-            tline: &Arc<Timeline>,
+            this_tline: &Arc<Timeline>,
            key: Key,
            ctx: &RequestContext,
+            has_data_below: bool,
+            history_lsn_point: Lsn,
        ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
-            if tline.ancestor_timeline.is_none() {
+            if !has_data_below {
                return Ok(None);
            };
            // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
            // as much existing code as possible.
-            let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
-            Ok(Some((key, tline.ancestor_lsn, img)))
+            let img = this_tline.get(key, history_lsn_point, ctx).await?;
+            Ok(Some((key, history_lsn_point, img)))
        }

        // Actually, we can decide not to write to the image layer at all at this point because
@@ -2268,7 +2354,8 @@ impl Timeline {
                        job_desc.gc_cutoff,
                        &job_desc.retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
+                        get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
+                            .await?,
                    )
                    .await?;
                retention
@@ -2297,7 +2384,7 @@ impl Timeline {
                job_desc.gc_cutoff,
                &job_desc.retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
+                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
            )
            .await?;
        retention
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -877,22 +877,24 @@ impl WalIngest {
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
-            .await?
-        {
-            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            for segno in modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+                .await?
+            {
+                let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;

-            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
-                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
-            });
+                let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                    pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
+                });

-            if may_delete {
-                modification
-                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
-                    .await?;
-                trace!("Drop CLOG segment {:>04X}", segno);
+                if may_delete {
+                    modification
+                        .drop_slru_segment(SlruKind::Clog, segno, ctx)
+                        .await?;
+                    trace!("Drop CLOG segment {:>04X}", segno);
+                }
            }
        }

@@ -1047,16 +1049,18 @@ impl WalIngest {

        // Delete all the segments except the last one. The last segment can still
        // contain, possibly partially, valid data.
-        while segment != endsegment {
-            modification
-                .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
-                .await?;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            while segment != endsegment {
+                modification
+                    .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
+                    .await?;

-            /* move to next segment, handling wraparound correctly */
-            if segment == maxsegment {
-                segment = 0;
-            } else {
-                segment += 1;
+                /* move to next segment, handling wraparound correctly */
+                if segment == maxsegment {
+                    segment = 0;
+                } else {
+                    segment += 1;
+                }
            }
        }

--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -34,6 +34,8 @@ DATA = \
 	neon--1.2--1.3.sql \
 	neon--1.3--1.4.sql \
 	neon--1.4--1.5.sql \
+	neon--1.5--1.6.sql \
+	neon--1.6--1.5.sql \
 	neon--1.5--1.4.sql \
 	neon--1.4--1.3.sql \
 	neon--1.3--1.2.sql \
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -428,6 +428,8 @@ MergeTable()
 		hash_seq_init(&status, old_table->role_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
+			RoleEntry * old;
+			bool found_old = false;
 			RoleEntry  *to_write = hash_search(
 											   CurrentDdlTable->role_table,
 											   entry->name,
@@ -435,30 +437,23 @@ MergeTable()
 											   NULL);

 			to_write->type = entry->type;
-			if (entry->password)
-				to_write->password = entry->password;
+			to_write->password = entry->password;
 			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
-			if (entry->old_name[0] != '\0')
-			{
-				bool		found_old = false;
-				RoleEntry  *old = hash_search(
-											  CurrentDdlTable->role_table,
-											  entry->old_name,
-											  HASH_FIND,
-											  &found_old);
+			if (entry->old_name[0] == '\0')
+				continue;

-				if (found_old)
-				{
-					if (old->old_name[0] != '\0')
-						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
-					else
-						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
-					hash_search(CurrentDdlTable->role_table,
-								entry->old_name,
-								HASH_REMOVE,
-								NULL);
-				}
-			}
+			old = hash_search(
+							  CurrentDdlTable->role_table,
+							  entry->old_name,
+							  HASH_FIND,
+							  &found_old);
+			if (!found_old)
+				continue;
+			strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+			hash_search(CurrentDdlTable->role_table,
+						entry->old_name,
+						HASH_REMOVE,
+						NULL);
 		}
 		hash_destroy(old_table->role_table);
 	}
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -22,6 +22,7 @@
 #include "neon_pgversioncompat.h"

 #include "access/parallel.h"
+#include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pagestore_client.h"
@@ -40,12 +41,16 @@
 #include "utils/dynahash.h"
 #include "utils/guc.h"

+#if PG_VERSION_NUM >= 150000
+#include "access/xlogrecovery.h"
+#endif
+
 #include "hll.h"
 #include "bitmap.h"
 #include "neon.h"
 #include "neon_perf_counters.h"

-#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
+#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

 /*
 * Local file cache is used to temporary store relations pages in local file system.
@@ -100,7 +105,9 @@ typedef struct FileCacheEntry
 	BufferTag	key;
 	uint32		hash;
 	uint32		offset;
-	uint32		access_count;
+	uint32		access_count : 30;
+	uint32      prewarm_requested : 1; /* entry should be filled by prewarm */
+	uint32      prewarm_started : 1;   /* chunk is written by lfc_prewarm */
 	uint32		bitmap[CHUNK_BITMAP_SIZE];
 	dlist_node	list_node;		/* LRU/holes list node */
 } FileCacheEntry;
@@ -118,17 +125,29 @@ typedef struct FileCacheControl
 	uint64		writes;			/* number of writes issued */
 	uint64		time_read;		/* time spent reading (us) */
 	uint64		time_write;		/* time spent writing (us) */
+	uint32		prewarm_total_chunks;
+	uint32		prewarm_curr_chunk;
+	uint32		prewarmed_pages;
+	uint32		skipped_pages;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;

+typedef struct FileCacheStateEntry
+{
+	BufferTag	key;
+	uint32		bitmap[CHUNK_BITMAP_SIZE];
+} FileCacheStateEntry;
+
 static HTAB *lfc_hash;
 static int	lfc_desc = 0;
 static LWLockId lfc_lock;
 static int	lfc_max_size;
 static int	lfc_size_limit;
+static int	lfc_prewarm_limit;
+static int	lfc_prewarm_batch;
 static char *lfc_path;
 static FileCacheControl *lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
@@ -149,7 +168,7 @@ lfc_disable(char const *op)
 {
 	int			fd;

-	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
+	elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

 	/* Invalidate hash */
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -184,7 +203,7 @@ lfc_disable(char const *op)
 			pgstat_report_wait_end();

 			if (rc < 0)
-				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
+				elog(WARNING, "LFC: failed to truncate local file cache %s: %m", lfc_path);
 		}
 	}

@@ -196,7 +215,7 @@ lfc_disable(char const *op)

 	fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 	if (fd < 0)
-		elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
+		elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
 	else
 		close(fd);

@@ -267,14 +286,7 @@ lfc_shmem_startup(void)
 								 n_chunks + 1, n_chunks + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
-		lfc_ctl->generation = 0;
-		lfc_ctl->size = 0;
-		lfc_ctl->used = 0;
-		lfc_ctl->hits = 0;
-		lfc_ctl->misses = 0;
-		lfc_ctl->writes = 0;
-		lfc_ctl->time_read = 0;
-		lfc_ctl->time_write = 0;
+		memset(lfc_ctl, 0, sizeof *lfc_ctl);
 		dlist_init(&lfc_ctl->lru);
 		dlist_init(&lfc_ctl->holes);

@@ -285,7 +297,7 @@ lfc_shmem_startup(void)
 		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
 		{
-			elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
+			elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
 			lfc_ctl->limit = 0;
 		}
 		else
@@ -327,7 +339,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 {
 	if (*newval > lfc_max_size)
 	{
-		elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
+		elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
 		return false;
 	}
 	return true;
@@ -365,6 +377,10 @@ lfc_change_limit_hook(int newval, void *extra)
 			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
 		/* We remove the old entry, and re-enter a hole to the hash table */
+		for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+		{
+			lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
+		}
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);

 		memset(&holetag, 0, sizeof(holetag));
@@ -436,6 +452,32 @@ lfc_init(void)
 							   NULL,
 							   NULL);

+	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
+							"Maximal number of prewarmed pages",
+							NULL,
+							&lfc_prewarm_limit,
+							0,	/* disabled by default */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
+	DefineCustomIntVariable("neon.file_cache_prewarm_batch",
+							"Number of pages retrivied by prewarm from page server",
+							NULL,
+							&lfc_prewarm_batch,
+							64,
+							1,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
 	if (lfc_max_size == 0)
 		return;

@@ -449,6 +491,264 @@ lfc_init(void)
 #endif
 }

+static FileCacheStateEntry*
+lfc_get_state(size_t* n_entries)
+{
+	size_t max_entries = *n_entries;
+	size_t i = 0;
+	FileCacheStateEntry* fs;
+
+	if (lfc_maybe_disabled() || max_entries == 0)	/* fast exit if file cache is disabled */
+		return NULL;
+
+	fs = (FileCacheStateEntry*)palloc(sizeof(FileCacheStateEntry) * max_entries);
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+
+	if (LFC_ENABLED())
+	{
+		dlist_iter	iter;
+		dlist_reverse_foreach(iter, &lfc_ctl->lru)
+		{
+			FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
+			memcpy(&fs[i].key, &entry->key, sizeof entry->key);
+			memcpy(fs[i].bitmap, entry->bitmap, sizeof entry->bitmap);
+			if (++i == max_entries)
+				break;
+		}
+		elog(LOG, "LFC: save state of %ld chunks", (long)i);
+	}
+
+	LWLockRelease(lfc_lock);
+
+	*n_entries = i;
+	return fs;
+}
+
+/*
+ * Prewarm LFC cache to the specified state.
+ *
+ * Prewarming can interfere with accesses to the pages by other backends. Usually access to LFC is protected by shared buffers: when Postgres
+ * is reading page, it pins shared buffer and enforces that only one backend is reading it, while other are waiting for read completion.
+ *
+ * But it is not true for prewarming: backend can fetch page itself, modify and then write it to LFC. At the
+ * same time `lfc_prewarm` tries to write deteriorated image of this page in LFC. To increase concurrency, access to LFC files (both read and write)
+ * is performed without holding locks. So it can happen that two or more processes write different content to the same location in the LFC file.
+ * Certainly we can not rely on disk content in this case.
+ *
+ * To solve this problem we use two flags in LFC entry: `prewarm_requested` and `prewarm_started`. First is set before prewarm is actually started.
+ * `lfc_prewarm` writes to LFC file only if this flag is set. This flag is cleared if any other backend performs write to this LFC chunk.
+ * In this case data loaded by `lfc_prewarm` is considered to be deteriorated and should be just ignored.
+ *
+ * But as far as write to LFC is performed without holding lock, there is no guarantee that no such write is in progress.
+ * This is why second flag is used: `prewarm_started`. It is set by `lfc_prewarm` when is starts writing page and cleared when write is completed.
+ * Any other backend writing to LFC should abandon it's write to LFC file (just not mark page as loaded in bitmap) if this flag is set.
+ * So neither `lfc_prewarm`, neither backend are saving page in LFC in this case - it is just skipped.
+ */
+
+static void
+lfc_prewarm(FileCacheStateEntry* fs, size_t n_entries)
+{
+	ssize_t rc;
+	size_t snd_idx = 0, rcv_idx = 0;
+	size_t n_sent = 0, n_received = 0;
+	FileCacheEntry *entry;
+	uint64 generation;
+	uint32 entry_offset;
+	uint32 hash;
+	size_t i;
+	bool   found;
+	int    shard_no;
+
+	if (!lfc_ensure_opened())
+		return;
+
+	if (n_entries == 0 || fs == NULL)
+	{
+		elog(LOG, "LFC: prewarm is disabled");
+		return;
+	}
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	/* Do not prewarm more entries than LFC limit */
+	if (lfc_ctl->limit <= lfc_ctl->size)
+	{
+		LWLockRelease(lfc_lock);
+		return;
+	}
+	if (n_entries > lfc_ctl->limit - lfc_ctl->size)
+	{
+		n_entries = lfc_ctl->limit - lfc_ctl->size;
+	}
+
+	/* Initialize fields used to track prewarming progress */
+	lfc_ctl->prewarm_total_chunks = n_entries;
+	lfc_ctl->prewarm_curr_chunk = 0;
+
+    /*
+	 * Load LFC state and add entries in hash table.
+	 * It is needed to track modification of prewarmed pages.
+	 * All such entries have `prewarm_requested` flag set. When entry is updated (some backed reads or writes
+	 * some pages from this chunk), then `prewarm_requested` flag is cleared, prohibiting prewarm of this chunk.
+	 * It prevents overwritting page updated or loaded by backend with older one, loaded by prewarm.
+	 */
+	for (i = 0; i < n_entries; i++)
+	{
+		hash = get_hash_value(lfc_hash, &fs[i].key);
+		entry = hash_search_with_hash_value(lfc_hash, &fs[i].key, hash, HASH_ENTER, &found);
+		/* Do not prewarm chunks which are already present in LFC */
+		if (!found)
+		{
+			entry->offset = lfc_ctl->size++;
+			entry->hash = hash;
+			entry->access_count = 0;
+			entry->prewarm_requested = true;
+			entry->prewarm_started = false;
+			memset(entry->bitmap, 0, sizeof entry->bitmap);
+			/* Most recently visted pages are stored first */
+			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
+			lfc_ctl->used += 1;
+		}
+	}
+	LWLockRelease(lfc_lock);
+
+	elog(LOG, "LFC: start loading %ld chunks", (long)n_entries);
+
+	while (true)
+	{
+		size_t chunk_no = snd_idx / BLOCKS_PER_CHUNK;
+		size_t offs_in_chunk = snd_idx % BLOCKS_PER_CHUNK;
+		if (chunk_no < n_entries)
+		{
+			if (fs[chunk_no].bitmap[offs_in_chunk >> 5] & (1 << (offs_in_chunk & 31)))
+			{
+				/*
+				 * In case of prewarming replica we should be careful not to load too new version
+				 * of the page - with LSN larger than current replay LSN.
+				 * At primary we are always loading latest version.
+				 */
+				XLogRecPtr req_lsn = RecoveryInProgress() ? GetXLogReplayRecPtr(NULL) : UINT64_MAX;
+
+				NeonGetPageRequest request = {
+					.req.tag = T_NeonGetPageRequest,
+					/* lsn and not_modified_since are filled in below */
+					.rinfo = BufTagGetNRelFileInfo(fs[chunk_no].key),
+					.forknum = fs[chunk_no].key.forkNum,
+					.blkno = fs[chunk_no].key.blockNum + offs_in_chunk,
+					.req.lsn = req_lsn,
+					.req.not_modified_since = 0
+				};
+				shard_no = get_shard_number(&fs[chunk_no].key);
+				while (!page_server->send(shard_no, (NeonRequest *) &request)
+					|| !page_server->flush(shard_no))
+				{
+					/* page server disconnected: all previusly sent prefetch requests are lost */
+					n_sent = 0;
+				}
+				n_sent += 1;
+			}
+			snd_idx += 1;
+		}
+		if (n_sent >= n_received + lfc_prewarm_batch || chunk_no == n_entries)
+		{
+			NeonResponse * resp;
+			do
+			{
+				chunk_no = rcv_idx / BLOCKS_PER_CHUNK;
+				offs_in_chunk = rcv_idx % BLOCKS_PER_CHUNK;
+				rcv_idx += 1;
+			} while (!(fs[chunk_no].bitmap[offs_in_chunk >> 5] & (1 << (offs_in_chunk & 31))));
+
+			shard_no = get_shard_number(&fs[chunk_no].key);
+			resp = page_server->receive(shard_no);
+			lfc_ctl->prewarm_curr_chunk = chunk_no;
+
+			switch (resp->tag)
+			{
+				case T_NeonGetPageResponse:
+					break;
+				case T_NeonErrorResponse:
+				{
+					/* Prefech can request page which is already dropped so PS can respond with error: just ignore it */
+					NeonErrorResponse *err_resp = (NeonErrorResponse *) resp;
+					elog(LOG, "LFC: page server failed to load page %u of relation %u/%u/%u.%u: %s",
+						 fs[chunk_no].key.blockNum + (BlockNumber)offs_in_chunk, RelFileInfoFmt(BufTagGetNRelFileInfo(fs[chunk_no].key)), fs[chunk_no].key.forkNum, err_resp->message);
+					continue;
+				}
+				default:
+					elog(LOG, "LFC: unexpected response type: %d", resp->tag);
+					return;
+			}
+
+			hash = get_hash_value(lfc_hash, &fs[chunk_no].key);
+
+			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+			entry = hash_search_with_hash_value(lfc_hash, &fs[chunk_no].key, hash, HASH_FIND, NULL);
+			if (entry != NULL && entry->prewarm_requested)
+			{
+				/* Unlink entry from LRU list to pin it for the duration of IO operation */
+				if (entry->access_count++ == 0)
+					dlist_delete(&entry->list_node);
+
+				generation = lfc_ctl->generation;
+				entry_offset = entry->offset;
+				Assert(!entry->prewarm_started);
+				entry->prewarm_started = true;
+
+				LWLockRelease(lfc_lock);
+
+				rc = pwrite(lfc_desc, ((NeonGetPageResponse*)resp)->page, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + offs_in_chunk) * BLCKSZ);
+				if (rc != BLCKSZ)
+				{
+					lfc_disable("write");
+					break;
+				}
+				else
+				{
+					LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+					if (lfc_ctl->generation == generation)
+					{
+						CriticalAssert(LFC_ENABLED());
+						if (--entry->access_count == 0)
+							dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+						if (entry->prewarm_requested)
+						{
+							lfc_ctl->used_pages += 1 - ((entry->bitmap[offs_in_chunk >> 5] >> (offs_in_chunk & 31)) & 1);
+							entry->bitmap[offs_in_chunk >> 5] |= 1 << (offs_in_chunk & 31);
+							lfc_ctl->prewarmed_pages += 1;
+						}
+						else
+						{
+							lfc_ctl->skipped_pages += 1;
+						}
+						Assert(entry->prewarm_started);
+						entry->prewarm_started = false;
+					}
+
+					LWLockRelease(lfc_lock);
+				}
+			}
+			else
+			{
+				Assert(!entry || !entry->prewarm_started);
+				lfc_ctl->skipped_pages += 1;
+				LWLockRelease(lfc_lock);
+			}
+
+			if (++n_received == n_sent && snd_idx >= n_entries * BLOCKS_PER_CHUNK)
+			{
+				break;
+			}
+		}
+	}
+	Assert(n_sent == n_received);
+	lfc_ctl->prewarm_curr_chunk = n_entries;
+	elog(LOG, "LFC: complete prewarming: loaded %ld pages", (long)n_received);
+}
+
+
 /*
 * Check if page is present in the cache.
 * Returns true if page is found in local cache.
@@ -616,6 +916,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)

 	/* remove the page from the cache */
 	entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
+	entry->prewarm_requested = false; /* prohibit prewarm of this LFC entry */

 	if (entry->access_count == 0)
 	{
@@ -861,7 +1162,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);

-	/* 
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	if (!LFC_ENABLED())
+	{
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	/*
 	 * For every chunk that has blocks we're interested in, we
 	 * 1. get the chunk header
 	 * 2. Check if the chunk actually has the blocks we're interested in
@@ -887,18 +1196,26 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 		hash = get_hash_value(lfc_hash, &tag);

-		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-		if (!LFC_ENABLED())
-		{
-			LWLockRelease(lfc_lock);
-			return;
-		}
-
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);

 		if (found)
 		{
+			if (entry->prewarm_started)
+			{
+				/*
+				 * Some page of this chunk is currently written by `lfc_prewarm`.
+				 * We should give-up not to interfere with it.
+				 * But clearing `prewarm_requested` flag also will not allow `lfc_prewarm` to fix it result.
+				 */
+				entry->prewarm_requested = false;
+				/* cleanup all affected pages of the chunk: we do not know which one of them is conflicting with prewarm */
+				for (int i = 0; i < blocks_in_chunk; i++)
+				{
+					lfc_ctl->used_pages -= ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
+					entry->bitmap[(chunk_offs + i) >> 5] &= ~(1 << ((chunk_offs + i) & 31));
+				}
+				goto next_chunk;
+			}
 			/*
 			 * Unlink entry from LRU list to pin it for the duration of IO
 			 * operation
@@ -928,7 +1245,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			{
 				/* Cache overflow: evict least recently used chunk */
 				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-	
+
 				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				{
 					lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
@@ -944,10 +1261,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
 				uint32		offset = hole->offset;
 				bool		hole_found;
-	
+
 				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found);
 				CriticalAssert(hole_found);
-	
+
 				lfc_ctl->used += 1;
 				entry->offset = offset;	/* reuse the hole */
 			}
@@ -959,9 +1276,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			}
 			entry->access_count = 1;
 			entry->hash = hash;
+			entry->prewarm_started = false;
 			memset(entry->bitmap, 0, sizeof entry->bitmap);
 		}

+		entry->prewarm_requested = false; /* prohibit prewarm if LFC entry is updated by some backend */
 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;
 		LWLockRelease(lfc_lock);
@@ -976,6 +1295,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		if (rc != BLCKSZ * blocks_in_chunk)
 		{
 			lfc_disable("write");
+			return;
 		}
 		else
 		{
@@ -1005,12 +1325,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				}
 			}

-			LWLockRelease(lfc_lock);
 		}
+	  next_chunk:
 		blkno += blocks_in_chunk;
 		buf_offset += blocks_in_chunk;
 		nblocks -= blocks_in_chunk;
 	}
+	LWLockRelease(lfc_lock);
 }

 typedef struct
@@ -1334,3 +1655,69 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
 	}
 	PG_RETURN_NULL();
 }
+
+PG_FUNCTION_INFO_V1(get_local_cache_state);
+
+Datum
+get_local_cache_state(PG_FUNCTION_ARGS)
+{
+	size_t n_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
+	FileCacheStateEntry* fs = lfc_get_state(&n_entries);
+	if (fs != NULL)
+	{
+		size_t size_in_bytes = sizeof(FileCacheStateEntry) * n_entries;
+		bytea* res = (bytea*)palloc(VARHDRSZ + size_in_bytes);
+
+		SET_VARSIZE(res, VARHDRSZ + size_in_bytes);
+		memcpy(VARDATA(res), fs, size_in_bytes);
+		pfree(fs);
+
+		PG_RETURN_BYTEA_P(res);
+	}
+	PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(prewarm_local_cache);
+
+Datum
+prewarm_local_cache(PG_FUNCTION_ARGS)
+{
+	bytea* state = PG_GETARG_BYTEA_PP(0);
+	uint32 n_entries = VARSIZE_ANY_EXHDR(state)/sizeof(FileCacheStateEntry);
+	FileCacheStateEntry* fs = (FileCacheStateEntry*)VARDATA_ANY(state);
+
+	lfc_prewarm(fs, n_entries);
+
+	PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(get_prewarm_info);
+
+Datum
+get_prewarm_info(PG_FUNCTION_ARGS)
+{
+	Datum		values[4];
+	bool		nulls[4];
+	TupleDesc	tupdesc;
+
+	if (lfc_size_limit == 0)
+		PG_RETURN_NULL();
+
+	tupdesc = CreateTemplateTupleDesc(4);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_chunks", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "curr_chunk", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prewarmed_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "skipped_pages", INT4OID, -1, 0);
+	tupdesc = BlessTupleDesc(tupdesc);
+
+	MemSet(nulls, 0, sizeof(nulls));
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	values[0] = Int32GetDatum(lfc_ctl->prewarm_total_chunks);
+	values[1] = Int32GetDatum(lfc_ctl->prewarm_curr_chunk);
+	values[2] = Int32GetDatum(lfc_ctl->prewarmed_pages);
+	values[3] = Int32GetDatum(lfc_ctl->skipped_pages);
+	LWLockRelease(lfc_lock);
+
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void)
 	{
 		cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
 		elog(LOG,
-			"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
-			LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
+			"ls_monitor: number of snapshot files, %zu, is larger than limit of %d",
+			snapshot_index, logical_replication_max_snap_files);
 	}

 	/* Is the size of the logical snapshots directory larger than specified?
@@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void)
 		}

 		if (cutoff != original)
-			elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
-					LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
+			elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB",
+				 logical_replication_max_logicalsnapdir_size);
 	}

 	pfree(snapshot_descriptors);
@@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void)
 }

 /*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
+ * Unused logical replication slots pins WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
- * need too many .snap files.
+ * need too many .snap files. These files are stored as AUX files, which are a
+ * pageserver mechanism for storing non-relation data. AUX files are shipped in
+ * in the basebackup which is requested by compute_ctl before Postgres starts.
+ * The larger the time to retrieve the basebackup, the more likely it is the
+ * compute will be killed by the control plane due to a timeout.
 */
 void
 LogicalSlotsMonitorMain(Datum main_arg)
@@ -239,10 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 			ProcessConfigFile(PGC_SIGHUP);
 		}

-		/*
-		 * If there are too many .snap files, just drop all logical slots to
-		 * prevent aux files bloat.
-		 */
+		/* Get the cutoff LSN */
 		cutoff_lsn = get_snapshots_cutoff_lsn();
 		if (cutoff_lsn > 0)
 		{
@@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg)
 				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
 				XLogRecPtr	restart_lsn;

-				/* find the name */
 				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
-				/* Consider only logical repliction slots */
+
+				/* Consider only active logical repliction slots */
 				if (!s->in_use || !SlotIsLogical(s))
 				{
 					LWLockRelease(ReplicationSlotControlLock);
 					continue;
 				}

-				/* do we need to drop it? */
+				/*
+				 * Retrieve the restart LSN to determine if we need to drop the
+				 * slot
+				 */
 				SpinLockAcquire(&s->mutex);
 				restart_lsn = s->data.restart_lsn;
 				SpinLockRelease(&s->mutex);
+
+				strlcpy(slot_name, s->data.name.data, sizeof(slot_name));
+				LWLockRelease(ReplicationSlotControlLock);
+
 				if (restart_lsn >= cutoff_lsn)
 				{
-					LWLockRelease(ReplicationSlotControlLock);
+					elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X",
+						 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
 					continue;
 				}

-				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
-				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+				elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X",
 					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
-				LWLockRelease(ReplicationSlotControlLock);

-				/* now try to drop it, killing owner before if any */
+				/* now try to drop it, killing owner before, if any */
 				for (;;)
 				{
 					pid_t		active_pid;
@@ -288,9 +295,9 @@ LogicalSlotsMonitorMain(Datum main_arg)
 					if (active_pid == 0)
 					{
 						/*
-						 * Slot is releasted, try to drop it. Though of course
+						 * Slot is released, try to drop it. Though of course,
 						 * it could have been reacquired, so drop can ERROR
-						 * out. Similarly it could have been dropped in the
+						 * out. Similarly, it could have been dropped in the
 						 * meanwhile.
 						 *
 						 * In principle we could remove pg_try/pg_catch, that
@@ -300,14 +307,14 @@ LogicalSlotsMonitorMain(Datum main_arg)
 						PG_TRY();
 						{
 							ReplicationSlotDrop(slot_name, true);
-							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+							elog(LOG, "ls_monitor: replication slot %s dropped", slot_name);
 						}
 						PG_CATCH();
 						{
 							/* log ERROR and reset elog stack */
 							EmitErrorReport();
 							FlushErrorState();
-							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+							elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name);
 						}
 						PG_END_TRY();
 						break;
@@ -315,7 +322,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 					else
 					{
 						/* kill the owner and wait for release */
-						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid);
 						(void) kill(active_pid, SIGTERM);
 						/* We shouldn't get stuck, but to be safe add timeout. */
 						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
--- a/pgxn/neon/neon--1.5--1.6.sql
+++ b/pgxn/neon/neon--1.5--1.6.sql
@@ -0,0 +1,22 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. \quit
+
+CREATE FUNCTION get_prewarm_info(out total_chunks integer, out curr_chunk integer, out prewarmed_pages integer, out skipped_pages integer)
+RETURNS record
+AS 'MODULE_PATHNAME', 'get_prewarm_info'
+LANGUAGE C STRICT
+PARALLEL SAFE;
+
+CREATE FUNCTION get_local_cache_state(max_chunks integer default null)
+RETURNS bytea
+AS 'MODULE_PATHNAME', 'get_local_cache_state'
+LANGUAGE C
+PARALLEL UNSAFE;
+
+CREATE FUNCTION prewarm_local_cache(state bytea)
+RETURNS void
+AS 'MODULE_PATHNAME', 'prewarm_local_cache'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+
+
--- a/pgxn/neon/neon--1.6--1.5.sql
+++ b/pgxn/neon/neon--1.6--1.5.sql
@@ -0,0 +1,7 @@
+DROP FUNCTION IF EXISTS get_prewarm_info(out total_chunks integer, out curr_chunk integer, out prewarmed_pages integer, out skipped_pages integer);
+
+DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);
+
+DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea);
+
+
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -74,10 +74,6 @@ impl std::fmt::Display for Backend<'_, ()> {
                    .debug_tuple("ControlPlane::ProxyV1")
                    .field(&endpoint.url())
                    .finish(),
-                ControlPlaneClient::Neon(endpoint) => fmt
-                    .debug_tuple("ControlPlane::Neon")
-                    .field(&endpoint.url())
-                    .finish(),
                #[cfg(any(test, feature = "testing"))]
                ControlPlaneClient::PostgresMock(endpoint) => fmt
                    .debug_tuple("ControlPlane::PostgresMock")
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -43,9 +43,6 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
-    #[value(name("console"), alias("cplane"))]
-    ControlPlane,
-
    #[value(name("cplane-v1"), alias("control-plane"))]
    ControlPlaneV1,

@@ -488,40 +485,7 @@ async fn main() -> anyhow::Result<()> {
    }

    if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
-        if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
-            match (redis_notifications_client, regional_redis_client.clone()) {
-                (None, None) => {}
-                (client1, client2) => {
-                    let cache = api.caches.project_info.clone();
-                    if let Some(client) = client1 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            cancel_map.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    if let Some(client) = client2 {
-                        maintenance_tasks.spawn(notifications::task_main(
-                            client,
-                            cache.clone(),
-                            cancel_map.clone(),
-                            args.region.clone(),
-                        ));
-                    }
-                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-                }
-            }
-            if let Some(regional_redis_client) = regional_redis_client {
-                let cache = api.caches.endpoints_cache.clone();
-                let con = regional_redis_client;
-                let span = tracing::info_span!("endpoints_cache");
-                maintenance_tasks.spawn(
-                    async move { cache.do_read(con, cancellation_token.clone()).await }
-                        .instrument(span),
-                );
-            }
-        } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
+        if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
            match (redis_notifications_client, regional_redis_client.clone()) {
                (None, None) => {}
                (client1, client2) => {
@@ -757,65 +721,6 @@ fn build_auth_backend(
            Ok(Either::Left(config))
        }

-        AuthBackendType::ControlPlane => {
-            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
-            let project_info_cache_config: ProjectInfoCacheOptions =
-                args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;
-
-            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
-            info!(
-                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
-            );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
-            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
-                wake_compute_cache_config,
-                project_info_cache_config,
-                endpoint_cache_config,
-            )));
-
-            let config::ConcurrencyLockOptions {
-                shards,
-                limiter,
-                epoch,
-                timeout,
-            } = args.wake_compute_lock.parse()?;
-            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
-            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
-                "wake_compute_lock",
-                limiter,
-                shards,
-                timeout,
-                epoch,
-                &Metrics::get().wake_compute_lock,
-            )?));
-            tokio::spawn(locks.garbage_collect_worker());
-
-            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
-
-            let endpoint = http::Endpoint::new(url, http::new_client());
-
-            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
-            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
-            let wake_compute_endpoint_rate_limiter =
-                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
-
-            let api = control_plane::client::neon::NeonControlPlaneClient::new(
-                endpoint,
-                args.control_plane_token.clone(),
-                caches,
-                locks,
-                wake_compute_endpoint_rate_limiter,
-            );
-            let api = control_plane::client::ControlPlaneClient::Neon(api);
-            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
-
-            let config = Box::leak(Box::new(auth_backend));
-
-            Ok(Either::Left(config))
-        }
-
        #[cfg(feature = "testing")]
        AuthBackendType::Postgres => {
            let url = args.auth_endpoint.parse()?;
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -1,7 +1,6 @@
 pub mod cplane_proxy_v1;
 #[cfg(any(test, feature = "testing"))]
 pub mod mock;
-pub mod neon;

 use std::hash::Hash;
 use std::sync::Arc;
@@ -28,10 +27,8 @@ use crate::types::EndpointId;
 #[non_exhaustive]
 #[derive(Clone)]
 pub enum ControlPlaneClient {
-    /// New Proxy V1 control plane API
+    /// Proxy V1 control plane API
    ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
-    /// Current Management API (V2).
-    Neon(neon::NeonControlPlaneClient),
    /// Local mock control plane.
    #[cfg(any(test, feature = "testing"))]
    PostgresMock(mock::MockControlPlane),
@@ -49,7 +46,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
        match self {
            Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
-            Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(test)]
@@ -66,7 +62,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
        match self {
            Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(test)]
@@ -81,7 +76,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
        match self {
            Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
-            Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
            #[cfg(test)]
@@ -96,7 +90,6 @@ impl ControlPlaneApi for ControlPlaneClient {
    ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
        match self {
            Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
-            Self::Neon(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
            Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(test)]
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -1,511 +0,0 @@
-//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use ::http::header::AUTHORIZATION;
-use ::http::HeaderName;
-use futures::TryFutureExt;
-use postgres_client::config::SslMode;
-use tokio::time::Instant;
-use tracing::{debug, info, info_span, warn, Instrument};
-
-use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
-use crate::auth::backend::jwt::AuthRule;
-use crate::auth::backend::ComputeUserInfo;
-use crate::cache::Cached;
-use crate::context::RequestContext;
-use crate::control_plane::caches::ApiCaches;
-use crate::control_plane::errors::{
-    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
-};
-use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
-use crate::control_plane::{
-    AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
-};
-use crate::metrics::{CacheOutcome, Metrics};
-use crate::rate_limiter::WakeComputeRateLimiter;
-use crate::types::{EndpointCacheKey, EndpointId};
-use crate::{compute, http, scram};
-
-const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
-
-#[derive(Clone)]
-pub struct NeonControlPlaneClient {
-    endpoint: http::Endpoint,
-    pub caches: &'static ApiCaches,
-    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
-    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    // put in a shared ref so we don't copy secrets all over in memory
-    jwt: Arc<str>,
-}
-
-impl NeonControlPlaneClient {
-    /// Construct an API object containing the auth parameters.
-    pub fn new(
-        endpoint: http::Endpoint,
-        jwt: Arc<str>,
-        caches: &'static ApiCaches,
-        locks: &'static ApiLocks<EndpointCacheKey>,
-        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
-    ) -> Self {
-        Self {
-            endpoint,
-            caches,
-            locks,
-            wake_compute_endpoint_rate_limiter,
-            jwt,
-        }
-    }
-
-    pub(crate) fn url(&self) -> &str {
-        self.endpoint.url().as_str()
-    }
-
-    async fn do_get_auth_info(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &user_info.endpoint.normalize())
-        {
-            // TODO: refactor this because it's weird
-            // this is a failure to authenticate but we return Ok.
-            info!("endpoint is not valid, skipping the request");
-            return Ok(AuthInfo::default());
-        }
-        let request_id = ctx.session_id().to_string();
-        let application_name = ctx.console_application_name();
-        async {
-            let request = self
-                .endpoint
-                .get_path("proxy_get_role_secret")
-                .header(X_REQUEST_ID, &request_id)
-                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .query(&[
-                    ("application_name", application_name.as_str()),
-                    ("project", user_info.endpoint.as_str()),
-                    ("role", user_info.user.as_str()),
-                ])
-                .build()?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self.endpoint.execute(request).await?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-            let body = match parse_body::<GetRoleSecret>(response).await {
-                Ok(body) => body,
-                // Error 404 is special: it's ok not to have a secret.
-                // TODO(anna): retry
-                Err(e) => {
-                    return if e.get_reason().is_not_found() {
-                        // TODO: refactor this because it's weird
-                        // this is a failure to authenticate but we return Ok.
-                        Ok(AuthInfo::default())
-                    } else {
-                        Err(e.into())
-                    };
-                }
-            };
-
-            let secret = if body.role_secret.is_empty() {
-                None
-            } else {
-                let secret = scram::ServerSecret::parse(&body.role_secret)
-                    .map(AuthSecret::Scram)
-                    .ok_or(GetAuthInfoError::BadSecret)?;
-                Some(secret)
-            };
-            let allowed_ips = body.allowed_ips.unwrap_or_default();
-            Metrics::get()
-                .proxy
-                .allowed_ips_number
-                .observe(allowed_ips.len() as f64);
-            Ok(AuthInfo {
-                secret,
-                allowed_ips,
-                project_id: body.project_id,
-            })
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_get_auth_info"))
-        .await
-    }
-
-    async fn do_get_endpoint_jwks(
-        &self,
-        ctx: &RequestContext,
-        endpoint: EndpointId,
-    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &endpoint.normalize())
-        {
-            return Err(GetEndpointJwksError::EndpointNotFound);
-        }
-        let request_id = ctx.session_id().to_string();
-        async {
-            let request = self
-                .endpoint
-                .get_with_url(|url| {
-                    url.path_segments_mut()
-                        .push("endpoints")
-                        .push(endpoint.as_str())
-                        .push("jwks");
-                })
-                .header(X_REQUEST_ID, &request_id)
-                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .build()
-                .map_err(GetEndpointJwksError::RequestBuild)?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self
-                .endpoint
-                .execute(request)
-                .await
-                .map_err(GetEndpointJwksError::RequestExecute)?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-
-            let body = parse_body::<EndpointJwksResponse>(response).await?;
-
-            let rules = body
-                .jwks
-                .into_iter()
-                .map(|jwks| AuthRule {
-                    id: jwks.id,
-                    jwks_url: jwks.jwks_url,
-                    audience: jwks.jwt_audience,
-                    role_names: jwks.role_names,
-                })
-                .collect();
-
-            Ok(rules)
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_get_endpoint_jwks"))
-        .await
-    }
-
-    async fn do_wake_compute(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<NodeInfo, WakeComputeError> {
-        let request_id = ctx.session_id().to_string();
-        let application_name = ctx.console_application_name();
-        async {
-            let mut request_builder = self
-                .endpoint
-                .get_path("proxy_wake_compute")
-                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", ctx.session_id())])
-                .query(&[
-                    ("application_name", application_name.as_str()),
-                    ("project", user_info.endpoint.as_str()),
-                ]);
-
-            let options = user_info.options.to_deep_object();
-            if !options.is_empty() {
-                request_builder = request_builder.query(&options);
-            }
-
-            let request = request_builder.build()?;
-
-            debug!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
-            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-            let response = self.endpoint.execute(request).await?;
-            drop(pause);
-            info!(duration = ?start.elapsed(), "received http response");
-            let body = parse_body::<WakeCompute>(response).await?;
-
-            // Unfortunately, ownership won't let us use `Option::ok_or` here.
-            let (host, port) = match parse_host_port(&body.address) {
-                None => return Err(WakeComputeError::BadComputeAddress(body.address)),
-                Some(x) => x,
-            };
-
-            // Don't set anything but host and port! This config will be cached.
-            // We'll set username and such later using the startup message.
-            // TODO: add more type safety (in progress).
-            let mut config = compute::ConnCfg::new(host.to_owned(), port);
-            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
-
-            let node = NodeInfo {
-                config,
-                aux: body.aux,
-                allow_self_signed_compute: false,
-            };
-
-            Ok(node)
-        }
-        .inspect_err(|e| tracing::debug!(error = ?e))
-        .instrument(info_span!("do_wake_compute"))
-        .await
-    }
-}
-
-impl super::ControlPlaneApi for NeonControlPlaneClient {
-    #[tracing::instrument(skip_all)]
-    async fn get_role_secret(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        let normalized_ep = &user_info.endpoint.normalize();
-        let user = &user_info.user;
-        if let Some(role_secret) = self
-            .caches
-            .project_info
-            .get_role_secret(normalized_ep, user)
-        {
-            return Ok(role_secret);
-        }
-        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-            self.caches.project_info.insert_role_secret(
-                project_id,
-                normalized_ep_int,
-                user.into(),
-                auth_info.secret.clone(),
-            );
-            self.caches.project_info.insert_allowed_ips(
-                project_id,
-                normalized_ep_int,
-                Arc::new(auth_info.allowed_ips),
-            );
-            ctx.set_project_id(project_id);
-        }
-        // When we just got a secret, we don't need to invalidate it.
-        Ok(Cached::new_uncached(auth_info.secret))
-    }
-
-    async fn get_allowed_ips_and_secret(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        let normalized_ep = &user_info.endpoint.normalize();
-        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
-            Metrics::get()
-                .proxy
-                .allowed_ips_cache_misses
-                .inc(CacheOutcome::Hit);
-            return Ok((allowed_ips, None));
-        }
-        Metrics::get()
-            .proxy
-            .allowed_ips_cache_misses
-            .inc(CacheOutcome::Miss);
-        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
-        let allowed_ips = Arc::new(auth_info.allowed_ips);
-        let user = &user_info.user;
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-            self.caches.project_info.insert_role_secret(
-                project_id,
-                normalized_ep_int,
-                user.into(),
-                auth_info.secret.clone(),
-            );
-            self.caches.project_info.insert_allowed_ips(
-                project_id,
-                normalized_ep_int,
-                allowed_ips.clone(),
-            );
-            ctx.set_project_id(project_id);
-        }
-        Ok((
-            Cached::new_uncached(allowed_ips),
-            Some(Cached::new_uncached(auth_info.secret)),
-        ))
-    }
-
-    #[tracing::instrument(skip_all)]
-    async fn get_endpoint_jwks(
-        &self,
-        ctx: &RequestContext,
-        endpoint: EndpointId,
-    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        self.do_get_endpoint_jwks(ctx, endpoint).await
-    }
-
-    #[tracing::instrument(skip_all)]
-    async fn wake_compute(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key = user_info.endpoint_cache_key();
-
-        macro_rules! check_cache {
-            () => {
-                if let Some(cached) = self.caches.node_info.get(&key) {
-                    let (cached, info) = cached.take_value();
-                    let info = info.map_err(|c| {
-                        info!(key = &*key, "found cached wake_compute error");
-                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
-                    })?;
-
-                    debug!(key = &*key, "found cached compute node info");
-                    ctx.set_project(info.aux.clone());
-                    return Ok(cached.map(|()| info));
-                }
-            };
-        }
-
-        // Every time we do a wakeup http request, the compute node will stay up
-        // for some time (highly depends on the console's scale-to-zero policy);
-        // The connection info remains the same during that period of time,
-        // which means that we might cache it to reduce the load and latency.
-        check_cache!();
-
-        let permit = self.locks.get_permit(&key).await?;
-
-        // after getting back a permit - it's possible the cache was filled
-        // double check
-        if permit.should_check_cache() {
-            // TODO: if there is something in the cache, mark the permit as success.
-            check_cache!();
-        }
-
-        // check rate limit
-        if !self
-            .wake_compute_endpoint_rate_limiter
-            .check(user_info.endpoint.normalize_intern(), 1)
-        {
-            return Err(WakeComputeError::TooManyConnections);
-        }
-
-        let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
-        match node {
-            Ok(node) => {
-                ctx.set_project(node.aux.clone());
-                debug!(key = &*key, "created a cache entry for woken compute node");
-
-                let mut stored_node = node.clone();
-                // store the cached node as 'warm_cached'
-                stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
-
-                let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
-
-                Ok(cached.map(|()| node))
-            }
-            Err(err) => match err {
-                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
-                    let Some(status) = &err.status else {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    };
-
-                    let reason = status
-                        .details
-                        .error_info
-                        .map_or(Reason::Unknown, |x| x.reason);
-
-                    // if we can retry this error, do not cache it.
-                    if reason.can_retry() {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    }
-
-                    // at this point, we should only have quota errors.
-                    debug!(
-                        key = &*key,
-                        "created a cache entry for the wake compute error"
-                    );
-
-                    self.caches.node_info.insert_ttl(
-                        key,
-                        Err(err.clone()),
-                        Duration::from_secs(30),
-                    );
-
-                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                        err,
-                    )))
-                }
-                err => return Err(err),
-            },
-        }
-    }
-}
-
-/// Parse http response body, taking status code into account.
-async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    response: http::Response,
-) -> Result<T, ControlPlaneError> {
-    let status = response.status();
-    if status.is_success() {
-        // We shouldn't log raw body because it may contain secrets.
-        info!("request succeeded, processing the body");
-        return Ok(response.json().await?);
-    }
-    let s = response.bytes().await?;
-    // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
-    info!("response_error plaintext: {:?}", s);
-
-    // Don't throw an error here because it's not as important
-    // as the fact that the request itself has failed.
-    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
-        warn!("failed to parse error body: {e}");
-        ControlPlaneErrorMessage {
-            error: "reason unclear (malformed error message)".into(),
-            http_status_code: status,
-            status: None,
-        }
-    });
-    body.http_status_code = status;
-
-    warn!("console responded with an error ({status}): {body:?}");
-    Err(ControlPlaneError::Message(Box::new(body)))
-}
-
-fn parse_host_port(input: &str) -> Option<(&str, u16)> {
-    let (host, port) = input.rsplit_once(':')?;
-    let ipv6_brackets: &[_] = &['[', ']'];
-    Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_host_port_v4() {
-        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
-        assert_eq!(host, "127.0.0.1");
-        assert_eq!(port, 5432);
-    }
-
-    #[test]
-    fn test_parse_host_port_v6() {
-        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
-        assert_eq!(host, "2001:db8::1");
-        assert_eq!(port, 5432);
-    }
-
-    #[test]
-    fn test_parse_host_port_url() {
-        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
-            .expect("failed to parse");
-        assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
-        assert_eq!(port, 5432);
-    }
-}
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -221,15 +221,6 @@ pub(crate) struct UserFacingMessage {
    pub(crate) message: Box<str>,
 }

-/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
-/// Returned by the `/proxy_get_role_secret` API method.
-#[derive(Deserialize)]
-pub(crate) struct GetRoleSecret {
-    pub(crate) role_secret: Box<str>,
-    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
-    pub(crate) project_id: Option<ProjectIdInt>,
-}
-
 /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
 /// Returned by the `/get_endpoint_access_control` API method.
 #[derive(Deserialize)]
@@ -240,13 +231,6 @@ pub(crate) struct GetEndpointAccessControl {
    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
 }

-// Manually implement debug to omit sensitive info.
-impl fmt::Debug for GetRoleSecret {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("GetRoleSecret").finish_non_exhaustive()
-    }
-}
-
 /// Response which holds compute node's `host:port` pair.
 /// Returned by the `/proxy_wake_compute` API method.
 #[derive(Debug, Deserialize)]
@@ -477,18 +461,18 @@ mod tests {
        let json = json!({
            "role_secret": "secret",
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
            "project_id": "project",
        });
-        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;

        Ok(())
    }
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,11 +1,12 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::{
-    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
+    safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn},
    state::{EvictionState, PersistedPeers, TimelinePersistentState},
    wal_backup_partial,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
+use safekeeper_api::{ServerInfo, Term};
 use serde::{Deserialize, Serialize};
 use tracing::*;
 use utils::{
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -14,6 +14,7 @@ use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use postgres_ffi::XLogSegNo;
 use postgres_ffi::MAX_SEND_SIZE;
+use safekeeper_api::models::WalSenderState;
 use serde::Deserialize;
 use serde::Serialize;

@@ -25,7 +26,6 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use crate::safekeeper::TermHistory;
-use crate::send_wal::WalSenderState;
 use crate::state::TimelineMemState;
 use crate::state::TimelinePersistentState;
 use crate::timeline::get_timeline_dir;
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -4,6 +4,8 @@
 use anyhow::Context;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
+use safekeeper_api::models::ConnectionId;
+use safekeeper_api::Term;
 use std::future::Future;
 use std::str::{self, FromStr};
 use std::sync::Arc;
@@ -16,9 +18,7 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE};
-use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
-use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
 use postgres_backend::PostgresBackend;
 use postgres_backend::QueryError;
--- a/safekeeper/src/http/client.rs
+++ b/safekeeper/src/http/client.rs
@@ -8,6 +8,7 @@
 //! etc.

 use reqwest::{IntoUrl, Method, StatusCode};
+use safekeeper_api::models::TimelineStatus;
 use std::error::Error as _;
 use utils::{
    http::error::HttpErrorBody,
@@ -15,8 +16,6 @@ use utils::{
    logging::SecretString,
 };

-use super::routes::TimelineStatus;
-
 #[derive(Debug, Clone)]
 pub struct Client {
    mgmt_api_endpoint: String,
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -1,5 +1,9 @@
 use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
+use safekeeper_api::models::AcceptorStateStatus;
+use safekeeper_api::models::SafekeeperStatus;
+use safekeeper_api::models::TermSwitchApiEntry;
+use safekeeper_api::models::TimelineStatus;
+use safekeeper_api::ServerInfo;
 use std::collections::HashMap;
 use std::fmt;
 use std::io::Write as _;
@@ -31,26 +35,17 @@ use utils::{
        request::{ensure_no_body, parse_request_param},
        RequestExt, RouterBuilder,
    },
-    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
+    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
 };

 use crate::debug_dump::TimelineDigestRequest;
-use crate::receive_wal::WalReceiverState;
-use crate::safekeeper::Term;
-use crate::safekeeper::{ServerInfo, TermLsn};
-use crate::send_wal::WalSenderState;
-use crate::timeline::PeerInfo;
+use crate::safekeeper::TermLsn;
 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline};

-#[derive(Debug, Serialize)]
-struct SafekeeperStatus {
-    id: NodeId,
-}
-
 /// Healthcheck handler.
 async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
@@ -73,50 +68,6 @@ fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
        .clone()
 }

-/// Same as TermLsn, but serializes LSN using display serializer
-/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub struct TermSwitchApiEntry {
-    pub term: Term,
-    pub lsn: Lsn,
-}
-
-impl From<TermSwitchApiEntry> for TermLsn {
-    fn from(api_val: TermSwitchApiEntry) -> Self {
-        TermLsn {
-            term: api_val.term,
-            lsn: api_val.lsn,
-        }
-    }
-}
-
-/// Augment AcceptorState with last_log_term for convenience
-#[derive(Debug, Serialize, Deserialize)]
-pub struct AcceptorStateStatus {
-    pub term: Term,
-    pub epoch: Term, // aka last_log_term
-    pub term_history: Vec<TermSwitchApiEntry>,
-}
-
-/// Info about timeline on safekeeper ready for reporting.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct TimelineStatus {
-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub acceptor_state: AcceptorStateStatus,
-    pub pg_info: ServerInfo,
-    pub flush_lsn: Lsn,
-    pub timeline_start_lsn: Lsn,
-    pub local_start_lsn: Lsn,
-    pub commit_lsn: Lsn,
-    pub backup_lsn: Lsn,
-    pub peer_horizon_lsn: Lsn,
-    pub remote_consistent_lsn: Lsn,
-    pub peers: Vec<PeerInfo>,
-    pub walsenders: Vec<WalSenderState>,
-    pub walreceivers: Vec<WalReceiverState>,
-}
-
 fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
    check_permission_with(request, |claims| {
        crate::auth::check_permission(claims, tenant_id)
@@ -187,6 +138,15 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, res)
 }

+impl From<TermSwitchApiEntry> for TermLsn {
+    fn from(api_val: TermSwitchApiEntry) -> Self {
+        TermLsn {
+            term: api_val.term,
+            lsn: api_val.lsn,
+        }
+    }
+}
+
 /// Report info about timeline.
 async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -8,16 +8,17 @@

 use anyhow::Context;
 use postgres_backend::QueryError;
+use safekeeper_api::{ServerInfo, Term};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::*;

 use crate::handler::SafekeeperPostgresHandler;
-use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
+use crate::safekeeper::{AcceptorProposerMessage, AppendResponse};
 use crate::safekeeper::{
    AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected,
 };
-use crate::safekeeper::{Term, TermHistory, TermLsn};
+use crate::safekeeper::{TermHistory, TermLsn};
 use crate::state::TimelinePersistentState;
 use crate::timeline::WalResidentTimeline;
 use postgres_backend::PostgresBackend;
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -4,6 +4,7 @@ use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
+use safekeeper_api::{models::TimelineStatus, Term};
 use serde::{Deserialize, Serialize};
 use std::{
    cmp::min,
@@ -21,11 +22,7 @@ use tracing::{error, info, instrument};
 use crate::{
    control_file::CONTROL_FILE_NAME,
    debug_dump,
-    http::{
-        client::{self, Client},
-        routes::TimelineStatus,
-    },
-    safekeeper::Term,
+    http::client::{self, Client},
    state::{EvictionState, TimelinePersistentState},
    timeline::{Timeline, WalResidentTimeline},
    timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -9,9 +9,7 @@ use crate::metrics::{
 };
 use crate::safekeeper::AcceptorProposerMessage;
 use crate::safekeeper::ProposerAcceptorMessage;
-use crate::safekeeper::ServerInfo;
 use crate::timeline::WalResidentTimeline;
-use crate::wal_service::ConnectionId;
 use crate::GlobalTimelines;
 use anyhow::{anyhow, Context};
 use bytes::BytesMut;
@@ -23,8 +21,8 @@ use postgres_backend::PostgresBackend;
 use postgres_backend::PostgresBackendReader;
 use postgres_backend::QueryError;
 use pq_proto::BeMessage;
-use serde::Deserialize;
-use serde::Serialize;
+use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus};
+use safekeeper_api::ServerInfo;
 use std::future;
 use std::net::SocketAddr;
 use std::sync::Arc;
@@ -171,21 +169,6 @@ impl WalReceiversShared {
    }
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct WalReceiverState {
-    /// None means it is recovery initiated by us (this safekeeper).
-    pub conn_id: Option<ConnectionId>,
-    pub status: WalReceiverStatus,
-}
-
-/// Walreceiver status. Currently only whether it passed voting stage and
-/// started receiving the stream, but it is easy to add more if needed.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum WalReceiverStatus {
-    Voting,
-    Streaming,
-}
-
 /// Scope guard to access slot in WalReceivers registry and unregister from
 /// it in Drop.
 pub struct WalReceiverGuard {
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -7,6 +7,8 @@ use std::{fmt, pin::pin};
 use anyhow::{bail, Context};
 use futures::StreamExt;
 use postgres_protocol::message::backend::ReplicationMessage;
+use safekeeper_api::models::{PeerInfo, TimelineStatus};
+use safekeeper_api::Term;
 use tokio::sync::mpsc::{channel, Receiver, Sender};
 use tokio::time::timeout;
 use tokio::{
@@ -24,13 +26,11 @@ use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
 use crate::safekeeper::{AppendRequest, AppendRequestHeader};
 use crate::timeline::WalResidentTimeline;
 use crate::{
-    http::routes::TimelineStatus,
    receive_wal::MSG_QUEUE_SIZE,
    safekeeper::{
-        AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
-        TermLsn, VoteRequest,
+        AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn,
+        VoteRequest,
    },
-    timeline::PeerInfo,
    SafeKeeperConf,
 };

--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -5,6 +5,9 @@ use byteorder::{LittleEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};

 use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
+use safekeeper_api::models::HotStandbyFeedback;
+use safekeeper_api::Term;
+use safekeeper_api::INVALID_TERM;
 use serde::{Deserialize, Serialize};
 use std::cmp::max;
 use std::cmp::min;
@@ -16,7 +19,6 @@ use tracing::*;

 use crate::control_file;
 use crate::metrics::MISC_OPERATION_SECONDS;
-use crate::send_wal::HotStandbyFeedback;

 use crate::state::TimelineState;
 use crate::wal_storage;
@@ -31,10 +33,6 @@ use utils::{
 const SK_PROTOCOL_VERSION: u32 = 2;
 pub const UNKNOWN_SERVER_VERSION: u32 = 0;

-/// Consensus logical timestamp.
-pub type Term = u64;
-pub const INVALID_TERM: Term = 0;
-
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
 pub struct TermLsn {
    pub term: Term,
@@ -198,16 +196,6 @@ impl AcceptorState {
    }
 }

-/// Information about Postgres. Safekeeper gets it once and then verifies
-/// all further connections from computes match.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct ServerInfo {
-    /// Postgres server version
-    pub pg_version: u32,
-    pub system_id: SystemId,
-    pub wal_seg_size: u32,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct PersistedPeerInfo {
    /// LSN up to which safekeeper offloaded WAL to s3.
@@ -1041,6 +1029,7 @@ where
 mod tests {
    use futures::future::BoxFuture;
    use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
+    use safekeeper_api::ServerInfo;

    use super::*;
    use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -4,11 +4,10 @@
 use crate::handler::SafekeeperPostgresHandler;
 use crate::metrics::RECEIVED_PS_FEEDBACKS;
 use crate::receive_wal::WalReceivers;
-use crate::safekeeper::{Term, TermLsn};
+use crate::safekeeper::TermLsn;
 use crate::send_interpreted_wal::InterpretedWalSender;
 use crate::timeline::WalResidentTimeline;
 use crate::wal_reader_stream::WalReaderStreamBuilder;
-use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use anyhow::{bail, Context as AnyhowContext};
 use bytes::Bytes;
@@ -19,7 +18,11 @@ use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
 use postgres_ffi::get_current_timestamp;
 use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
-use serde::{Deserialize, Serialize};
+use safekeeper_api::models::{
+    ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
+    WalSenderState, INVALID_FULL_TRANSACTION_ID,
+};
+use safekeeper_api::Term;
 use tokio::io::{AsyncRead, AsyncWrite};
 use utils::failpoint_support;
 use utils::id::TenantTimelineId;
@@ -28,7 +31,6 @@ use utils::postgres_client::PostgresClientProtocol;

 use std::cmp::{max, min};
 use std::net::SocketAddr;
-use std::str;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::watch::Receiver;
@@ -42,65 +44,6 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
 // neon extension of replication protocol
 const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z';

-type FullTransactionId = u64;
-
-/// Hot standby feedback received from replica
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub struct HotStandbyFeedback {
-    pub ts: TimestampTz,
-    pub xmin: FullTransactionId,
-    pub catalog_xmin: FullTransactionId,
-}
-
-const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;
-
-impl HotStandbyFeedback {
-    pub fn empty() -> HotStandbyFeedback {
-        HotStandbyFeedback {
-            ts: 0,
-            xmin: 0,
-            catalog_xmin: 0,
-        }
-    }
-}
-
-/// Standby status update
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub struct StandbyReply {
-    pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
-    pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
-    pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
-    pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
-    pub reply_requested: bool,
-}
-
-impl StandbyReply {
-    fn empty() -> Self {
-        StandbyReply {
-            write_lsn: Lsn::INVALID,
-            flush_lsn: Lsn::INVALID,
-            apply_lsn: Lsn::INVALID,
-            reply_ts: 0,
-            reply_requested: false,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub struct StandbyFeedback {
-    pub reply: StandbyReply,
-    pub hs_feedback: HotStandbyFeedback,
-}
-
-impl StandbyFeedback {
-    pub fn empty() -> Self {
-        StandbyFeedback {
-            reply: StandbyReply::empty(),
-            hs_feedback: HotStandbyFeedback::empty(),
-        }
-    }
-}
-
 /// WalSenders registry. Timeline holds it (wrapped in Arc).
 pub struct WalSenders {
    mutex: Mutex<WalSendersShared>,
@@ -341,25 +284,6 @@ impl WalSendersShared {
    }
 }

-// Serialized is used only for pretty printing in json.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct WalSenderState {
-    ttid: TenantTimelineId,
-    addr: SocketAddr,
-    conn_id: ConnectionId,
-    // postgres application_name
-    appname: Option<String>,
-    feedback: ReplicationFeedback,
-}
-
-// Receiver is either pageserver or regular standby, which have different
-// feedbacks.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-enum ReplicationFeedback {
-    Pageserver(PageserverFeedback),
-    Standby(StandbyFeedback),
-}
-
 // id of the occupied slot in WalSenders to access it (and save in the
 // WalSenderGuard). We could give Arc directly to the slot, but there is not
 // much sense in that as values aggregation which is performed on each feedback
@@ -888,6 +812,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

 #[cfg(test)]
 mod tests {
+    use safekeeper_api::models::FullTransactionId;
    use utils::id::{TenantId, TimelineId};

    use super::*;
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -5,7 +5,7 @@ use std::{cmp::max, ops::Deref};

 use anyhow::{bail, Result};
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::TimelineTermBumpResponse;
+use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term};
 use serde::{Deserialize, Serialize};
 use utils::{
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -14,10 +14,7 @@ use utils::{

 use crate::{
    control_file,
-    safekeeper::{
-        AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory,
-        UNKNOWN_SERVER_VERSION,
-    },
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION},
    timeline::TimelineError,
    wal_backup_partial::{self},
 };
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,8 +4,8 @@
 use anyhow::{anyhow, bail, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use remote_storage::RemotePath;
-use safekeeper_api::models::TimelineTermBumpResponse;
-use serde::{Deserialize, Serialize};
+use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse};
+use safekeeper_api::Term;
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantId;
@@ -31,9 +31,7 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use crate::control_file;
 use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
-use crate::safekeeper::{
-    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn,
-};
+use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
 use crate::send_wal::WalSenders;
 use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
 use crate::timeline_guard::ResidenceGuard;
@@ -47,40 +45,17 @@ use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
 use crate::SafeKeeperConf;
 use crate::{debug_dump, timeline_manager, wal_storage};

-/// Things safekeeper should know about timeline state on peers.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PeerInfo {
-    pub sk_id: NodeId,
-    pub term: Term,
-    /// Term of the last entry.
-    pub last_log_term: Term,
-    /// LSN of the last record.
-    pub flush_lsn: Lsn,
-    pub commit_lsn: Lsn,
-    /// Since which LSN safekeeper has WAL.
-    pub local_start_lsn: Lsn,
-    /// When info was received. Serde annotations are not very useful but make
-    /// the code compile -- we don't rely on this field externally.
-    #[serde(skip)]
-    #[serde(default = "Instant::now")]
-    ts: Instant,
-    pub pg_connstr: String,
-    pub http_connstr: String,
-}
-
-impl PeerInfo {
-    fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
-        PeerInfo {
-            sk_id: NodeId(sk_info.safekeeper_id),
-            term: sk_info.term,
-            last_log_term: sk_info.last_log_term,
-            flush_lsn: Lsn(sk_info.flush_lsn),
-            commit_lsn: Lsn(sk_info.commit_lsn),
-            local_start_lsn: Lsn(sk_info.local_start_lsn),
-            pg_connstr: sk_info.safekeeper_connstr.clone(),
-            http_connstr: sk_info.http_connstr.clone(),
-            ts,
-        }
+fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
+    PeerInfo {
+        sk_id: NodeId(sk_info.safekeeper_id),
+        term: sk_info.term,
+        last_log_term: sk_info.last_log_term,
+        flush_lsn: Lsn(sk_info.flush_lsn),
+        commit_lsn: Lsn(sk_info.commit_lsn),
+        local_start_lsn: Lsn(sk_info.local_start_lsn),
+        pg_connstr: sk_info.safekeeper_connstr.clone(),
+        http_connstr: sk_info.http_connstr.clone(),
+        ts,
    }
 }

@@ -697,7 +672,7 @@ impl Timeline {
        {
            let mut shared_state = self.write_shared_state().await;
            shared_state.sk.record_safekeeper_info(&sk_info).await?;
-            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
+            let peer_info = peer_info_from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
        }
        Ok(())
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -14,6 +14,7 @@ use std::{

 use futures::channel::oneshot;
 use postgres_ffi::XLogSegNo;
+use safekeeper_api::{models::PeerInfo, Term};
 use serde::{Deserialize, Serialize};
 use tokio::{
    task::{JoinError, JoinHandle},
@@ -32,10 +33,9 @@ use crate::{
    rate_limit::{rand_duration, RateLimiter},
    recovery::recovery_main,
    remove_wal::calc_horizon_lsn,
-    safekeeper::Term,
    send_wal::WalSenders,
    state::TimelineState,
-    timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline},
+    timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline},
    timeline_guard::{AccessService, GuardId, ResidenceGuard},
    timelines_set::{TimelineSetGuard, TimelinesSet},
    wal_backup::{self, WalBackupTaskHandle},
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -4,7 +4,6 @@

 use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
 use crate::rate_limit::RateLimiter;
-use crate::safekeeper::ServerInfo;
 use crate::state::TimelinePersistentState;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
@@ -13,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
 use camino_tempfile::Utf8TempDir;
+use safekeeper_api::ServerInfo;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -3,6 +3,7 @@ use anyhow::{Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::stream::FuturesOrdered;
 use futures::StreamExt;
+use safekeeper_api::models::PeerInfo;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use utils::backoff;
@@ -30,7 +31,7 @@ use tracing::*;
 use utils::{id::TenantTimelineId, lsn::Lsn};

 use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
-use crate::timeline::{PeerInfo, WalResidentTimeline};
+use crate::timeline::WalResidentTimeline;
 use crate::timeline_manager::{Manager, StateSnapshot};
 use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};

--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -22,6 +22,7 @@
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
+use safekeeper_api::Term;
 use serde::{Deserialize, Serialize};

 use tokio_util::sync::CancellationToken;
@@ -31,7 +32,6 @@ use utils::{id::NodeId, lsn::Lsn};
 use crate::{
    metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
    rate_limit::{rand_duration, RateLimiter},
-    safekeeper::Term,
    timeline::WalResidentTimeline,
    timeline_manager::StateSnapshot,
    wal_backup::{self},
--- a/safekeeper/src/wal_reader_stream.rs
+++ b/safekeeper/src/wal_reader_stream.rs
@@ -4,12 +4,12 @@ use async_stream::try_stream;
 use bytes::Bytes;
 use futures::Stream;
 use postgres_backend::CopyStreamHandlerEnd;
+use safekeeper_api::Term;
 use std::time::Duration;
 use tokio::time::timeout;
 use utils::lsn::Lsn;

 use crate::{
-    safekeeper::Term,
    send_wal::{EndWatch, WalSenderGuard},
    timeline::WalResidentTimeline,
 };
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -4,6 +4,7 @@
 //!
 use anyhow::{Context, Result};
 use postgres_backend::QueryError;
+use safekeeper_api::models::ConnectionId;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::net::TcpStream;
@@ -114,8 +115,6 @@ async fn handle_socket(
        .await
 }

-/// Unique WAL service connection ids are logged in spans for observability.
-pub type ConnectionId = u32;
 pub type ConnectionCount = u32;

 pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId {
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -15,12 +15,13 @@ use desim::{
 };
 use http::Uri;
 use safekeeper::{
-    safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION},
+    safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION},
    state::{TimelinePersistentState, TimelineState},
    timeline::TimelineError,
    wal_storage::Storage,
    SafeKeeperConf,
 };
+use safekeeper_api::ServerInfo;
 use tracing::{debug, info_span, warn};
 use utils::{
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
--- a/storage_controller/src/background_node_operations.rs
+++ b/storage_controller/src/background_node_operations.rs
@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
 use tokio_util::sync::CancellationToken;
 use utils::id::NodeId;

-pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
+pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;

 #[derive(Copy, Clone)]
 pub(crate) struct Drain {
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -18,8 +18,9 @@ use pageserver_api::controller_api::{
    ShardsPreferredAzsRequest, TenantCreateRequest,
 };
 use pageserver_api::models::{
-    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
+    TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
+    TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
+    TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
@@ -208,6 +209,27 @@ async fn handle_tenant_location_config(
    )
 }

+async fn handle_tenant_config_patch(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
+
+    json_response(
+        StatusCode::OK,
+        service.tenant_config_patch(config_req).await?,
+    )
+}
+
 async fn handle_tenant_config_set(
    service: Arc<Service>,
    req: Request<Body>,
@@ -857,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Infra)?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    let safekeepers = state.service.safekeepers_list().await?;
+    json_response(StatusCode::OK, safekeepers)
+}
+
 async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Scrubber)?;

@@ -1181,7 +1218,7 @@ impl From<ReconcileError> for ApiError {
 ///
 /// Not used by anything except manual testing.
 async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
+    check_permissions(&req, Scope::Infra)?;

    let id = parse_request_param::<i64>(&req, "id")?;

@@ -1199,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
    match res {
        Ok(b) => json_response(StatusCode::OK, b),
        Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
-            Err(ApiError::NotFound("unknown instance_id".into()))
+            Err(ApiError::NotFound("unknown instance id".into()))
        }
        Err(other) => Err(other.into()),
    }
@@ -1795,6 +1832,21 @@ pub fn make_router(
                RequestName("control_v1_metadata_health_list_outdated"),
            )
        })
+        // Safekeepers
+        .get("/control/v1/safekeeper", |r| {
+            named_request_span(
+                r,
+                handle_safekeeper_list,
+                RequestName("control_v1_safekeeper_list"),
+            )
+        })
+        .get("/control/v1/safekeeper/:id", |r| {
+            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+        })
+        .post("/control/v1/safekeeper/:id", |r| {
+            // id is in the body
+            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(
@@ -1847,13 +1899,6 @@ pub fn make_router(
        .put("/control/v1/step_down", |r| {
            named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
        })
-        .get("/control/v1/safekeeper/:id", |r| {
-            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
-        })
-        .post("/control/v1/safekeeper/:id", |r| {
-            // id is in the body
-            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -1863,6 +1908,13 @@ pub fn make_router(
        .delete("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
        })
+        .patch("/v1/tenant/config", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_config_patch,
+                RequestName("v1_tenant_config"),
+            )
+        })
        .put("/v1/tenant/config", |r| {
            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
        })
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -104,6 +104,7 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealth,
    ListMetadataHealthUnhealthy,
    ListMetadataHealthOutdated,
+    ListSafekeepers,
    GetLeader,
    UpdateLeader,
    SetPreferredAzs,
@@ -1011,6 +1012,22 @@ impl Persistence {
        Ok(())
    }

+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_safekeepers(&self) -> DatabaseResult<Vec<SafekeeperPersistence>> {
+        let safekeepers: Vec<SafekeeperPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::ListNodes,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::safekeepers::table.load::<SafekeeperPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len());
+
+        Ok(safekeepers)
+    }
+
    pub(crate) async fn safekeeper_get(
        &self,
        id: i64,
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -742,6 +742,50 @@ impl Scheduler {
        self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
    }

+    /// For choosing which AZ to schedule a new shard into, use this.  It will return the
+    /// AZ with the lowest median utilization.
+    ///
+    /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
+    /// node, because while tenants start out single sharded, when they grow and undergo
+    /// shard-split, they will occupy space on many nodes within an AZ.
+    ///
+    /// We use median rather than total free space or mean utilization, because
+    /// we wish to avoid preferring AZs that have low-load nodes resulting from
+    /// recent replacements.
+    ///
+    /// The practical result is that we will pick an AZ based on its median node, and
+    /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
+    pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
+        if self.nodes.is_empty() {
+            return None;
+        }
+
+        let mut scores_by_az = HashMap::new();
+        for (node_id, node) in &self.nodes {
+            let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
+            let score = match &node.may_schedule {
+                MaySchedule::Yes(utilization) => utilization.score(),
+                MaySchedule::No => PageserverUtilization::full().score(),
+            };
+            az_scores.push((node_id, node, score));
+        }
+
+        // Sort by utilization.  Also include the node ID to break ties.
+        for scores in scores_by_az.values_mut() {
+            scores.sort_by_key(|i| (i.2, i.0));
+        }
+
+        let mut median_by_az = scores_by_az
+            .iter()
+            .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
+            .collect::<Vec<_>>();
+        // Sort by utilization.  Also include the AZ to break ties.
+        median_by_az.sort_by_key(|i| (i.1, i.0));
+
+        // Return the AZ with the lowest median utilization
+        Some(median_by_az.first().unwrap().0.clone())
+    }
+
    /// Unit test access to internal state
    #[cfg(test)]
    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
@@ -1087,4 +1131,53 @@ mod tests {
            intent.clear(&mut scheduler);
        }
    }
+
+    #[test]
+    fn az_scheduling_for_new_tenant() {
+        let az_a_tag = AvailabilityZone("az-a".to_string());
+        let az_b_tag = AvailabilityZone("az-b".to_string());
+        let nodes = test_utils::make_test_nodes(
+            6,
+            &[
+                az_a_tag.clone(),
+                az_a_tag.clone(),
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+                az_b_tag.clone(),
+                az_b_tag.clone(),
+            ],
+        );
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        /// Force the utilization of a node in Scheduler's state to a particular
+        /// number of bytes used.
+        fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
+            let mut node = Node::new(
+                node_id,
+                "".to_string(),
+                0,
+                "".to_string(),
+                0,
+                scheduler.nodes.get(&node_id).unwrap().az.clone(),
+            );
+            node.set_availability(NodeAvailability::Active(test_utilization::simple(
+                shard_count,
+                0,
+            )));
+            scheduler.node_upsert(&node);
+        }
+
+        // Initial empty state.  Scores are tied, scheduler prefers lower AZ ID.
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
+
+        // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
+        set_utilization(&mut scheduler, NodeId(1), 1000000);
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
+
+        // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
+        // should prefer the other AZ.
+        set_utilization(&mut scheduler, NodeId(2), 1000000);
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
+    }
 }
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -29,6 +29,19 @@ diesel::table! {
    }
 }

+diesel::table! {
+    safekeepers (id) {
+        id -> Int8,
+        region_id -> Text,
+        version -> Int8,
+        host -> Text,
+        port -> Int4,
+        active -> Bool,
+        http_port -> Int4,
+        availability_zone_id -> Text,
+    }
+}
+
 diesel::table! {
    tenant_shards (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
@@ -45,18 +58,10 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
-
-diesel::table! {
-    safekeepers {
-        id -> Int8,
-        region_id -> Text,
-        version -> Int8,
-        instance_id -> Text,
-        host -> Text,
-        port -> Int4,
-        active -> Bool,
-        http_port -> Int4,
-        availability_zone_id -> Text,
-    }
-}
+diesel::allow_tables_to_appear_in_same_query!(
+    controllers,
+    metadata_health,
+    nodes,
+    safekeepers,
+    tenant_shards,
+);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -52,8 +52,8 @@ use pageserver_api::{
        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
    models::{
-        SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
-        TopTenantShardsRequest,
+        SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest,
+        TimelineArchivalConfigRequest, TopTenantShardsRequest,
    },
 };
 use reqwest::StatusCode;
@@ -100,6 +100,8 @@ use crate::{

 use context_iterator::TenantShardContextIterator;

+const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
+
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);

@@ -139,6 +141,7 @@ enum TenantOperations {
    Create,
    LocationConfig,
    ConfigSet,
+    ConfigPatch,
    TimeTravelRemoteStorage,
    Delete,
    UpdatePolicy,
@@ -1579,6 +1582,7 @@ impl Service {
                            attach_req.tenant_shard_id,
                            ShardIdentity::unsharded(),
                            PlacementPolicy::Attached(0),
+                            None,
                        ),
                    );
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -2106,6 +2110,16 @@ impl Service {
            )
        };

+        let preferred_az_id = {
+            let locked = self.inner.read().unwrap();
+            // Idempotency: take the existing value if the tenant already exists
+            if let Some(shard) = locked.tenants.get(create_ids.first().unwrap()) {
+                shard.preferred_az().cloned()
+            } else {
+                locked.scheduler.get_az_for_new_tenant()
+            }
+        };
+
        // Ordering: we persist tenant shards before creating them on the pageserver.  This enables a caller
        // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
        // during the creation, rather than risking leaving orphan objects in S3.
@@ -2125,7 +2139,7 @@ impl Service {
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
-                preferred_az_id: None,
+                preferred_az_id: preferred_az_id.as_ref().map(|az| az.to_string()),
            })
            .collect();

@@ -2161,6 +2175,7 @@ impl Service {
                    &create_req.shard_parameters,
                    create_req.config.clone(),
                    placement_policy.clone(),
+                    preferred_az_id.as_ref(),
                    &mut schedule_context,
                )
                .await;
@@ -2174,44 +2189,6 @@ impl Service {
            }
        }

-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            response_shards
-                .iter()
-                .filter_map(|resp| {
-                    let az_id = locked
-                        .nodes
-                        .get(&resp.node_id)
-                        .map(|n| n.get_availability_zone_id().clone())?;
-
-                    Some((resp.shard_id, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        // Note that we persist the preferred AZ for the new shards separately.
-        // In theory, we could "peek" the scheduler to determine where the shard will
-        // land, but the subsequent "real" call into the scheduler might select a different
-        // node. Hence, we do this awkward update to keep things consistent.
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            })?;
-
-        {
-            let mut locked = self.inner.write().unwrap();
-            for (tid, az_id) in updated {
-                if let Some(shard) = locked.tenants.get_mut(&tid) {
-                    shard.set_preferred_az(az_id);
-                }
-            }
-        }
-
        // If we failed to schedule shards, then they are still created in the controller,
        // but we return an error to the requester to avoid a silent failure when someone
        // tries to e.g. create a tenant whose placement policy requires more nodes than
@@ -2242,6 +2219,7 @@ impl Service {

    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
    /// case of a new tenant and a pre-existing one.
+    #[allow(clippy::too_many_arguments)]
    async fn do_initial_shard_scheduling(
        &self,
        tenant_shard_id: TenantShardId,
@@ -2249,6 +2227,7 @@ impl Service {
        shard_params: &ShardParameters,
        config: TenantConfig,
        placement_policy: PlacementPolicy,
+        preferred_az_id: Option<&AvailabilityZone>,
        schedule_context: &mut ScheduleContext,
    ) -> InitialShardScheduleOutcome {
        let mut locked = self.inner.write().unwrap();
@@ -2259,10 +2238,6 @@ impl Service {
            Entry::Occupied(mut entry) => {
                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");

-                // TODO: schedule() should take an anti-affinity expression that pushes
-                // attached and secondary locations (independently) away frorm those
-                // pageservers also holding a shard for this tenant.
-
                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
                    return InitialShardScheduleOutcome::ShardScheduleError(err);
                }
@@ -2286,6 +2261,7 @@ impl Service {
                    tenant_shard_id,
                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
                    placement_policy,
+                    preferred_az_id.cloned(),
                ));

                state.generation = initial_generation;
@@ -2602,6 +2578,55 @@ impl Service {
        Ok(result)
    }

+    pub(crate) async fn tenant_config_patch(
+        &self,
+        req: TenantConfigPatchRequest,
+    ) -> Result<(), ApiError> {
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            req.tenant_id,
+            TenantOperations::ConfigPatch,
+        )
+        .await;
+
+        let tenant_id = req.tenant_id;
+        let patch = req.config;
+
+        let base = {
+            let locked = self.inner.read().unwrap();
+            let shards = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+
+            let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable();
+
+            let first = match configs.peek() {
+                Some(first) => (*first).clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+                    ));
+                }
+            };
+
+            if !configs.all_equal() {
+                tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id);
+                // This can't happen because we atomically update the database records
+                // of all shards to the new value in [`Self::set_tenant_config_and_reconcile`].
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Tenant configs for {} are mismatched",
+                    req.tenant_id
+                )));
+            }
+
+            first
+        };
+
+        let updated_config = base.apply_patch(patch);
+        self.set_tenant_config_and_reconcile(tenant_id, updated_config)
+            .await
+    }
+
    pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
        // We require an exclusive lock, because we are updating persistent and in-memory state
        let _tenant_lock = trace_exclusive_lock(
@@ -2611,12 +2636,32 @@ impl Service {
        )
        .await;

-        let tenant_id = req.tenant_id;
-        let config = req.config;
+        let tenant_exists = {
+            let locked = self.inner.read().unwrap();
+            let mut r = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+            r.next().is_some()
+        };

+        if !tenant_exists {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+            ));
+        }
+
+        self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
+            .await
+    }
+
+    async fn set_tenant_config_and_reconcile(
+        &self,
+        tenant_id: TenantId,
+        config: TenantConfig,
+    ) -> Result<(), ApiError> {
        self.persistence
            .update_tenant_shard(
-                TenantFilter::Tenant(req.tenant_id),
+                TenantFilter::Tenant(tenant_id),
                None,
                Some(config.clone()),
                None,
@@ -4184,7 +4229,8 @@ impl Service {
                        },
                    );

-                    let mut child_state = TenantShard::new(child, child_shard, policy.clone());
+                    let mut child_state =
+                        TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
                    child_state.observed = ObservedState {
                        locations: child_observed,
@@ -6728,7 +6774,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                .await;

            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -6981,7 +7027,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                .await;
        }

@@ -7113,6 +7159,12 @@ impl Service {
        global_observed
    }

+    pub(crate) async fn safekeepers_list(
+        &self,
+    ) -> Result<Vec<crate::persistence::SafekeeperPersistence>, DatabaseError> {
+        self.persistence.list_safekeepers().await
+    }
+
    pub(crate) async fn get_safekeeper(
        &self,
        id: i64,
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -472,6 +472,7 @@ impl TenantShard {
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
        policy: PlacementPolicy,
+        preferred_az_id: Option<AvailabilityZone>,
    ) -> Self {
        metrics::METRICS_REGISTRY
            .metrics_group
@@ -495,7 +496,7 @@ impl TenantShard {
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
-            preferred_az_id: None,
+            preferred_az_id,
        }
    }

@@ -1571,6 +1572,7 @@ pub(crate) mod tests {
            )
            .unwrap(),
            policy,
+            None,
        )
    }

@@ -1597,7 +1599,7 @@ pub(crate) mod tests {
                    shard_number,
                    shard_count,
                };
-                let mut ts = TenantShard::new(
+                TenantShard::new(
                    tenant_shard_id,
                    ShardIdentity::new(
                        shard_number,
@@ -1606,13 +1608,8 @@ pub(crate) mod tests {
                    )
                    .unwrap(),
                    policy.clone(),
-                );
-
-                if let Some(az) = &preferred_az {
-                    ts.set_preferred_az(az.clone());
-                }
-
-                ts
+                    preferred_az.clone(),
+                )
            })
            .collect()
    }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -533,8 +533,9 @@ async fn list_timeline_blobs_impl(
 }

 pub(crate) struct RemoteTenantManifestInfo {
-    pub(crate) latest_generation: Option<Generation>,
-    pub(crate) manifests: Vec<(Generation, ListingObject)>,
+    pub(crate) generation: Generation,
+    pub(crate) manifest: TenantManifest,
+    pub(crate) listing_object: ListingObject,
 }

 pub(crate) enum ListTenantManifestResult {
@@ -543,7 +544,10 @@ pub(crate) enum ListTenantManifestResult {
        #[allow(dead_code)]
        unknown_keys: Vec<ListingObject>,
    },
-    NoErrors(RemoteTenantManifestInfo),
+    NoErrors {
+        latest_generation: Option<RemoteTenantManifestInfo>,
+        manifests: Vec<(Generation, ListingObject)>,
+    },
 }

 /// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
@@ -592,14 +596,6 @@ pub(crate) async fn list_tenant_manifests(
        unknown_keys.push(obj);
    }

-    if manifests.is_empty() {
-        tracing::debug!("No manifest for timeline.");
-
-        return Ok(ListTenantManifestResult::WithErrors {
-            errors,
-            unknown_keys,
-        });
-    }
    if !unknown_keys.is_empty() {
        errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));

@@ -609,6 +605,15 @@ pub(crate) async fn list_tenant_manifests(
        });
    }

+    if manifests.is_empty() {
+        tracing::debug!("No manifest for timeline.");
+
+        return Ok(ListTenantManifestResult::NoErrors {
+            latest_generation: None,
+            manifests,
+        });
+    }
+
    // Find the manifest with the highest generation
    let (latest_generation, latest_listing_object) = manifests
        .iter()
@@ -616,6 +621,8 @@ pub(crate) async fn list_tenant_manifests(
        .map(|(g, obj)| (*g, obj.clone()))
        .unwrap();

+    manifests.retain(|(gen, _obj)| gen != &latest_generation);
+
    let manifest_bytes =
        match download_object_with_retries(remote_client, &latest_listing_object.key).await {
            Ok(bytes) => bytes,
@@ -634,13 +641,15 @@ pub(crate) async fn list_tenant_manifests(
        };

    match TenantManifest::from_json_bytes(&manifest_bytes) {
-        Ok(_manifest) => {
-            return Ok(ListTenantManifestResult::NoErrors(
-                RemoteTenantManifestInfo {
-                    latest_generation: Some(latest_generation),
-                    manifests,
-                },
-            ));
+        Ok(manifest) => {
+            return Ok(ListTenantManifestResult::NoErrors {
+                latest_generation: Some(RemoteTenantManifestInfo {
+                    generation: latest_generation,
+                    manifest,
+                    listing_object: latest_listing_object,
+                }),
+                manifests,
+            });
        }
        Err(parse_error) => errors.push((
            latest_listing_object.key.get_path().as_str().to_owned(),
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,11 +4,13 @@ use std::time::Duration;

 use crate::checks::{
    list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
+    RemoteTenantManifestInfo,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
 use pageserver::tenant::remote_timeline_client::{
    parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
 };
@@ -527,7 +529,7 @@ async fn gc_tenant_manifests(
    target: &RootTarget,
    mode: GcMode,
    tenant_shard_id: TenantShardId,
-) -> anyhow::Result<GcSummary> {
+) -> anyhow::Result<(GcSummary, Option<RemoteTenantManifestInfo>)> {
    let mut gc_summary = GcSummary::default();
    match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
        ListTenantManifestResult::WithErrors {
@@ -537,33 +539,35 @@ async fn gc_tenant_manifests(
            for (_key, error) in errors {
                tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
            }
+            Ok((gc_summary, None))
        }
-        ListTenantManifestResult::NoErrors(mut manifest_info) => {
-            let Some(latest_gen) = manifest_info.latest_generation else {
-                return Ok(gc_summary);
+        ListTenantManifestResult::NoErrors {
+            latest_generation,
+            mut manifests,
+        } => {
+            let Some(latest_generation) = latest_generation else {
+                return Ok((gc_summary, None));
            };
-            manifest_info
-                .manifests
-                .sort_by_key(|(generation, _obj)| *generation);
+            manifests.sort_by_key(|(generation, _obj)| *generation);
            // skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
-            let candidates = manifest_info.manifests.iter().rev().skip(2);
+            let candidates = manifests.iter().rev().skip(2);
            for (_generation, key) in candidates {
                maybe_delete_tenant_manifest(
                    remote_client,
                    &min_age,
-                    latest_gen,
+                    latest_generation.generation,
                    key,
                    mode,
                    &mut gc_summary,
                )
                .instrument(
-                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
+                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key),
                )
                .await;
            }
+            Ok((gc_summary, Some(latest_generation)))
        }
    }
-    Ok(gc_summary)
 }

 async fn gc_timeline(
@@ -573,6 +577,7 @@ async fn gc_timeline(
    mode: GcMode,
    ttid: TenantShardTimelineId,
    accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+    tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
 ) -> anyhow::Result<GcSummary> {
    let mut summary = GcSummary::default();
    let data = list_timeline_blobs(remote_client, ttid, target).await?;
@@ -597,6 +602,60 @@ async fn gc_timeline(
        }
    };

+    if let Some(tenant_manifest_info) = &*tenant_manifest_info {
+        // TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead.
+        let maybe_offloaded = tenant_manifest_info
+            .manifest
+            .offloaded_timelines
+            .iter()
+            .find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id);
+        if let Some(offloaded) = maybe_offloaded {
+            let warnings = validate_index_part_with_offloaded(index_part, offloaded);
+            let warn = if warnings.is_empty() {
+                false
+            } else {
+                // Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles.
+                match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? {
+                    ListTenantManifestResult::WithErrors {
+                        errors,
+                        unknown_keys: _,
+                    } => {
+                        for (_key, error) in errors {
+                            tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}");
+                        }
+                        true
+                    }
+                    ListTenantManifestResult::NoErrors {
+                        latest_generation,
+                        manifests: _,
+                    } => {
+                        if let Some(new_latest_gen) = latest_generation {
+                            let manifest_changed = (
+                                new_latest_gen.generation,
+                                new_latest_gen.listing_object.last_modified,
+                            ) == (
+                                tenant_manifest_info.generation,
+                                tenant_manifest_info.listing_object.last_modified,
+                            );
+                            if manifest_changed {
+                                tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len());
+                            }
+                            manifest_changed
+                        } else {
+                            // The latest generation is gone. This timeline is in the progress of being deleted?
+                            false
+                        }
+                    }
+                }
+            };
+            if warn {
+                for warning in warnings {
+                    tracing::warn!(%ttid, "{}", warning);
+                }
+            }
+        }
+    }
+
    accumulator.lock().unwrap().update(ttid, index_part);

    for key in candidates {
@@ -608,6 +667,35 @@ async fn gc_timeline(
    Ok(summary)
 }

+fn validate_index_part_with_offloaded(
+    index_part: &IndexPart,
+    offloaded: &OffloadedTimelineManifest,
+) -> Vec<String> {
+    let mut warnings = Vec::new();
+    if let Some(archived_at_index_part) = index_part.archived_at {
+        if archived_at_index_part
+            .signed_duration_since(offloaded.archived_at)
+            .num_seconds()
+            != 0
+        {
+            warnings.push(format!(
+                "index-part archived_at={} differs from manifest archived_at={}",
+                archived_at_index_part, offloaded.archived_at
+            ));
+        }
+    } else {
+        warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string());
+    }
+    if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id {
+        warnings.push(format!(
+            "index-part anestor={:?} differs from manifest ancestor={:?}",
+            index_part.metadata.ancestor_timeline(),
+            offloaded.ancestor_timeline_id
+        ));
+    }
+    warnings
+}
+
 /// Physical garbage collection: removing unused S3 objects.
 ///
 /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
@@ -650,29 +738,38 @@ pub async fn pageserver_physical_gc(
        let target_ref = &target;
        let remote_client_ref = &remote_client;
        async move {
-            let summaries_from_manifests = match gc_tenant_manifests(
+            let gc_manifest_result = gc_tenant_manifests(
                remote_client_ref,
                min_age,
                target_ref,
                mode,
                tenant_shard_id,
            )
-            .await
-            {
-                Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
-                    gc_summary,
-                ))],
+            .await;
+            let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result {
+                Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest),
                Err(e) => {
                    tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
-                    Vec::new()
+                    (GcSummary::default(), None)
                }
            };
+            let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
+            let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
+                summary_from_manifest,
+            ));
            stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
                .await
                .map(|stream| {
                    stream
-                        .map_ok(GcSummaryOrContent::Content)
-                        .chain(futures::stream::iter(summaries_from_manifests.into_iter()))
+                        .zip(futures::stream::iter(std::iter::repeat(
+                            tenant_manifest_arc,
+                        )))
+                        .map(|(ttid_res, tenant_manifest_arc)| {
+                            ttid_res.map(move |ttid| {
+                                GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
+                            })
+                        })
+                        .chain(futures::stream::iter([summary_from_manifest].into_iter()))
                })
        }
    });
@@ -684,14 +781,17 @@ pub async fn pageserver_physical_gc(
    // Drain futures for per-shard GC, populating accumulator as a side effect
    {
        let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
-            GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
-                &remote_client,
-                &min_age,
-                &target,
-                mode,
-                ttid,
-                &accumulator,
-            )),
+            GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
+                futures::future::Either::Left(gc_timeline(
+                    &remote_client,
+                    &min_age,
+                    &target,
+                    mode,
+                    ttid,
+                    &accumulator,
+                    tenant_manifest_arc,
+                ))
+            }
            GcSummaryOrContent::GcSummary(gc_summary) => {
                futures::future::Either::Right(futures::future::ok(gc_summary))
            }
--- a/test_runner/fixtures/httpserver.py
+++ b/test_runner/fixtures/httpserver.py
@@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer

 if TYPE_CHECKING:
    from collections.abc import Iterator
+    from ssl import SSLContext

    from fixtures.port_distributor import PortDistributor

-# TODO: mypy fails with:
-#  Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor"  [attr-defined]
-# from fixtures.neon_fixtures import PortDistributor
+    ListenAddress = tuple[str, int]

 # compared to the fixtures from pytest_httpserver with same names, these are
 # always function scoped, so you can check and stop the server in tests.


@pytest.fixture(scope="function")
-def httpserver_ssl_context():
-    return None
+def httpserver_ssl_context() -> Iterator[SSLContext | None]:
+    yield None


@pytest.fixture(scope="function")
-def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]:
+def make_httpserver(
+    httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None
+) -> Iterator[HTTPServer]:
    host, port = httpserver_listen_address
    if not host:
        host = HTTPServer.DEFAULT_LISTEN_HOST
@@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]:


@pytest.fixture(scope="function")
-def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]:
+def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress:
    port = port_distributor.get_port()
    return ("localhost", port)
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -178,6 +178,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    counter("pageserver_timeline_wal_records_received"),
    counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
    *histogram("pageserver_page_service_batch_size"),
+    *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_directory_entries_count", -- only used if above a certain threshold
    # "pageserver_broken_tenants_count" -- used only for broken
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -435,7 +435,10 @@ class NeonEnvBuilder:

        self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode

-        self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
+        if pageserver_wal_receiver_protocol is not None:
+            self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
+        else:
+            self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED

        assert test_name.startswith(
            "test_"
@@ -2329,6 +2332,16 @@ class NeonStorageController(MetricsGetter, LogUtils):
                return None
            raise e

+    def get_safekeepers(self) -> list[dict[str, Any]]:
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/safekeeper",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        json = response.json()
+        assert isinstance(json, list)
+        return json
+
    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
        response = self.request(
            "PUT",
@@ -3209,7 +3222,7 @@ class NeonProxy(PgProtocol):
                *["--allow-self-signed-compute", "true"],
            ]

-    class Console(AuthBackend):
+    class ProxyV1(AuthBackend):
        def __init__(self, endpoint: str, fixed_rate_limit: int | None = None):
            self.endpoint = endpoint
            self.fixed_rate_limit = fixed_rate_limit
@@ -3217,7 +3230,7 @@ class NeonProxy(PgProtocol):
        def extra_args(self) -> list[str]:
            args = [
                # Console auth backend params
-                *["--auth-backend", "console"],
+                *["--auth-backend", "cplane-v1"],
                *["--auth-endpoint", self.endpoint],
                *["--sql-over-http-pool-opt-in", "false"],
            ]
@@ -3465,13 +3478,13 @@ class NeonProxy(PgProtocol):


 class NeonAuthBroker:
-    class ControlPlane:
+    class ProxyV1:
        def __init__(self, endpoint: str):
            self.endpoint = endpoint

        def extra_args(self) -> list[str]:
            args = [
-                *["--auth-backend", "console"],
+                *["--auth-backend", "cplane-v1"],
                *["--auth-endpoint", self.endpoint],
            ]
            return args
@@ -3483,7 +3496,7 @@ class NeonAuthBroker:
        http_port: int,
        mgmt_port: int,
        external_http_port: int,
-        auth_backend: NeonAuthBroker.ControlPlane,
+        auth_backend: NeonAuthBroker.ProxyV1,
    ):
        self.domain = "apiauth.localtest.me"  # resolves to 127.0.0.1
        self.host = "127.0.0.1"
@@ -3669,7 +3682,7 @@ def static_auth_broker(
    local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}"

    # return local_proxy addr on ProxyWakeCompute.
-    httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
+    httpserver.expect_request("/cplane/wake_compute").respond_with_json(
        {
            "address": local_proxy_addr,
            "aux": {
@@ -3709,7 +3722,7 @@ def static_auth_broker(
        http_port=http_port,
        mgmt_port=mgmt_port,
        external_http_port=external_http_port,
-        auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")),
+        auth_backend=NeonAuthBroker.ProxyV1(httpserver.url_for("/cplane")),
    ) as proxy:
        proxy.start()
        yield proxy
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        )
        self.verbose_error(res)

-    def patch_tenant_config_client_side(
+    def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
+        """
+        Only use this via storage_controller.pageserver_api().
+
+        See `set_tenant_config` for more information.
+        """
+        assert "tenant_id" not in updates.keys()
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/config",
+            json={**updates, "tenant_id": str(tenant_id)},
+        )
+        self.verbose_error(res)
+
+    def update_tenant_config(
        self,
        tenant_id: TenantId,
        inserts: dict[str, Any] | None = None,
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):

        See `set_tenant_config` for more information.
        """
-        current = self.tenant_config(tenant_id).tenant_specific_overrides
-        if inserts is not None:
-            current.update(inserts)
-        if removes is not None:
-            for key in removes:
-                del current[key]
-        self.set_tenant_config(tenant_id, current)
+        if inserts is None:
+            inserts = {}
+        if removes is None:
+            removes = []
+
+        patch = inserts | {remove: None for remove in removes}
+        self.patch_tenant_config(tenant_id, patch)

    def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
        return self.tenant_size_and_modelinputs(tenant_id)[0]
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -70,6 +70,9 @@ class MockS3Server:
    def secret_key(self) -> str:
        return "test"

+    def session_token(self) -> str:
+        return "test"
+
    def kill(self):
        self.server.stop()

@@ -161,6 +164,7 @@ class S3Storage:
    bucket_region: str
    access_key: str | None
    secret_key: str | None
+    session_token: str | None
    aws_profile: str | None
    prefix_in_bucket: str
    client: S3Client
@@ -181,13 +185,18 @@ class S3Storage:
            if home is not None:
                env["HOME"] = home
            return env
-        if self.access_key is not None and self.secret_key is not None:
+        if (
+            self.access_key is not None
+            and self.secret_key is not None
+            and self.session_token is not None
+        ):
            return {
                "AWS_ACCESS_KEY_ID": self.access_key,
                "AWS_SECRET_ACCESS_KEY": self.secret_key,
+                "AWS_SESSION_TOKEN": self.session_token,
            }
        raise RuntimeError(
-            "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage"
+            "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN) have to be set for S3Storage"
        )

    def to_string(self) -> str:
@@ -352,6 +361,7 @@ class RemoteStorageKind(StrEnum):
            mock_region = mock_s3_server.region()

            access_key, secret_key = mock_s3_server.access_key(), mock_s3_server.secret_key()
+            session_token = mock_s3_server.session_token()

            client = boto3.client(
                "s3",
@@ -359,6 +369,7 @@ class RemoteStorageKind(StrEnum):
                region_name=mock_region,
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
+                aws_session_token=session_token,
            )

            bucket_name = to_bucket_name(user, test_name)
@@ -372,6 +383,7 @@ class RemoteStorageKind(StrEnum):
                bucket_region=mock_region,
                access_key=access_key,
                secret_key=secret_key,
+                session_token=session_token,
                aws_profile=None,
                prefix_in_bucket="",
                client=client,
@@ -383,9 +395,10 @@ class RemoteStorageKind(StrEnum):

        env_access_key = os.getenv("AWS_ACCESS_KEY_ID")
        env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
+        env_access_token = os.getenv("AWS_SESSION_TOKEN")
        env_profile = os.getenv("AWS_PROFILE")
        assert (
-            env_access_key and env_secret_key
+            env_access_key and env_secret_key and env_access_token
        ) or env_profile, "need to specify either access key and secret access key or profile"

        bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET")
@@ -398,6 +411,9 @@ class RemoteStorageKind(StrEnum):
        client = boto3.client(
            "s3",
            region_name=bucket_region,
+            aws_access_key_id=env_access_key,
+            aws_secret_access_key=env_secret_key,
+            aws_session_token=env_access_token,
        )

        return S3Storage(
@@ -405,6 +421,7 @@ class RemoteStorageKind(StrEnum):
            bucket_region=bucket_region,
            access_key=env_access_key,
            secret_key=env_secret_key,
+            session_token=env_access_token,
            aws_profile=env_profile,
            prefix_in_bucket=prefix_in_bucket,
            client=client,
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -153,6 +153,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
        if i % 10 == 0:
            log.info(f"Running churn round {i}/{churn_rounds} ...")

+        if (i - 1) % 10 == 0:
            # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
            ps_http.timeline_compact(
                tenant_id,
@@ -161,10 +162,11 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
                body={
                    "scheduled": True,
                    "sub_compaction": True,
-                    "compact_range": {
+                    "compact_key_range": {
                        "start": "000000000000000000000000000000000000",
                        "end": "030000000000000000000000000000000000",
                    },
+                    "sub_compaction_max_job_size_mb": 16,
                },
            )

--- a/Show More
+++ b/Show More