Merge pull request #10107 from neondatabase/rc/release-proxy/2024-12-12

Proxy release 2024-12-12
Merge branch 'release-proxy' into rc/release-proxy/2024-12-12
2026-07-14 01:20:38 +00:00 · 2024-12-12 10:21:30 +00:00 · 2024-12-12 09:58:31 +00:00 · 2024-12-12 06:02:08 +00:00 · 2024-12-12 01:09:24 +00:00 · 2024-12-11 22:21:42 +00:00
178 changed files with 9884 additions and 1897 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -21,3 +21,5 @@ config-variables:
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
  - BENCHMARK_INGEST_TARGET_PROJECTID
+  - PGREGRESS_PG16_PROJECT_ID
+  - PGREGRESS_PG17_PROJECT_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -43,7 +43,8 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
+             [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -23,7 +23,8 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
+             [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -283,7 +283,7 @@ jobs:
          submodules: true

      - name: Pytest regression tests
-        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
+        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
--- a/.github/workflows/_create-release-pr.yml
+++ b/.github/workflows/_create-release-pr.yml
@@ -21,7 +21,7 @@ defaults:
    shell: bash -euo pipefail {0}

 jobs:
-  create-storage-release-branch:
+  create-release-branch:
    runs-on: ubuntu-22.04

    permissions:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -6,6 +6,7 @@ on:
      - main
      - release
      - release-proxy
+      - release-compute
  pull_request:

 defaults:
@@ -70,8 +71,10 @@ jobs:
            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
          fi
        shell: bash
@@ -252,15 +255,17 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      # run without LFC on v17 release only
+      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
+      # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the
+      # debug build with LFC enabled doesn't block merging.
      test-cfg: |
-        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v15", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v16", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "with-lfc"}]'
-                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
+        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v15", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v16", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "without-lfc"}]'
+                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc" }]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -513,7 +518,7 @@ jobs:
            })

  trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
    needs: [ check-permissions, promote-images, tag ]
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit
@@ -934,7 +939,7 @@ jobs:
                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

      - name: Configure AWS-prod credentials
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-central-1
@@ -943,12 +948,12 @@ jobs:

      - name: Login to prod ECR
        uses: docker/login-action@v3
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
        with:
          registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com

      - name: Copy all images to prod ECR
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
        run: |
          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
@@ -968,7 +973,7 @@ jobs:
      tenant_id: ${{ vars.AZURE_TENANT_ID }}

  push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
    needs: [ tag, promote-images ]
    uses: ./.github/workflows/_push-to-acr.yml
    with:
@@ -1056,13 +1061,77 @@ jobs:
  deploy:
    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()

    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    steps:
      - uses: actions/checkout@v4

+      - name: Create git tag and GitHub release
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+        uses: actions/github-script@v7
+        with:
+          retries: 5  # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          script: |
+            const tag = "${{ needs.tag.outputs.build-tag }}";
+
+            try {  // Reuse the tag if it already exists and points at this commit
+              const existingRef = await github.rest.git.getRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `tags/${tag}`,
+              });
+
+              if (existingRef.data.object.sha !== context.sha) {
+                throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
+              }
+
+              console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
+            } catch (error) {
+              if (error.status !== 404) {  // Only "not found" means the tag still has to be created
+                throw error;
+              }
+
+              console.log(`Tag ${tag} does not exist. Creating it...`);
+              await github.rest.git.createRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `refs/tags/${tag}`,
+                sha: context.sha,
+              });
+              console.log(`Tag ${tag} created successfully.`);
+            }
+
+            // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
+            if (context.ref !== 'refs/heads/release') {
+              console.log(`GitHub release skipped for ${context.ref}.`);
+              return;
+            }
+
+            try {
+              const existingRelease = await github.rest.repos.getReleaseByTag({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag: tag,
+              });
+
+              console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
+            } catch (error) {
+              if (error.status !== 404) {  // Only "not found" means the release still has to be created
+                throw error;
+              }
+
+              console.log(`Release for tag ${tag} does not exist. Creating it...`);
+              await github.rest.repos.createRelease({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag_name: tag,
+                generate_release_notes: true,
+              });
+              console.log(`Release for tag ${tag} created successfully.`);
+            }
+
      - name: Trigger deploy workflow
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1105,40 +1174,13 @@ jobs:
              -f deployProxyAuthBroker=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'"
            exit 1
          fi

-      - name: Create git tag
-        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.git.createRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
-              sha: context.sha,
-            })
-
-      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
-      - name: Create GitHub release
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.repos.createRelease({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              tag_name: "${{ needs.tag.outputs.build-tag }}",
-              generate_release_notes: true,
-            })
-
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -23,11 +23,14 @@ jobs:
  regress:
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]

    runs-on: us-east-2
    container:
@@ -40,9 +43,11 @@ jobs:
          submodules: true

      - name: Patch the test
+        env:
+          PG_VERSION: ${{matrix.pg-version}}
        run: |
-          cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
-          patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
+          cd "vendor/postgres-v${PG_VERSION}"
+          patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"

      - name: Generate a random password
        id: pwgen
@@ -55,8 +60,9 @@ jobs:
      - name: Change tests according to the generated password
        env:
          DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
+          PG_VERSION: ${{matrix.pg-version}}
        run: |
-          cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
+          cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
          for fname in sql/*.sql expected/*.out; do
            sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
          done
@@ -73,15 +79,30 @@ jobs:
          path: /tmp/neon/
          prefix: latest

+      - name: Create a new branch
+        id: create-branch
+        uses: ./.github/actions/neon-branch-create
+        with:
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+          project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
+
      - name: Run the regression tests
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: cloud_regress
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          pg_version: ${{matrix.pg-version}}
          extra_params: -m remote_cluster
        env:
-          BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
+          BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
+
+      - name: Delete branch
+        if: always()  # clean up the per-run branch even when the regression tests fail
+        uses: ./.github/actions/neon-branch-delete
+        with:
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+          project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
+          branch_id: ${{steps.create-branch.outputs.branch_id}}

      - name: Create Allure report
        id: create-allure-report
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -26,6 +26,7 @@ concurrency:
 jobs:
  ingest:
    strategy:
+      fail-fast: false # allow other variants to continue even if one fails
      matrix:
        target_project: [new_empty_project, large_existing_project]  
    permissions:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,6 +15,10 @@ on:
        type: boolean
        description: 'Create Proxy release PR'
        required: false
+      create-compute-release-branch:
+        type: boolean
+        description: 'Create Compute release PR'
+        required: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -25,20 +29,20 @@ defaults:

 jobs:
  create-storage-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}

    permissions:
      contents: write

    uses: ./.github/workflows/_create-release-pr.yml
    with:
-      component-name: 'Storage & Compute'
+      component-name: 'Storage'
      release-branch: 'release'
    secrets:
      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

  create-proxy-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }}

    permissions:
      contents: write
@@ -49,3 +53,16 @@ jobs:
      release-branch: 'release-proxy'
    secrets:
      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
+
+  create-compute-release-branch:
+    if: inputs.create-compute-release-branch
+
+    permissions:
+      contents: write
+
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Compute'
+      release-branch: 'release-compute'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -51,6 +51,8 @@ jobs:
            echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
--- a/32
+++ b/32
@@ -1,15 +1,29 @@
-/.github/ @neondatabase/developer-productivity
-/compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
-/libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/storage
+# Autoscaling
 /libs/vm_monitor/ @neondatabase/autoscaling
-/pageserver/ @neondatabase/storage
+
+# DevProd
+/.github/ @neondatabase/developer-productivity
+
+# Compute
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/vendor/ @neondatabase/compute
+/compute/ @neondatabase/compute
+/compute_tools/ @neondatabase/compute
+
+# Proxy
+/libs/proxy/ @neondatabase/proxy
 /proxy/ @neondatabase/proxy
+
+# Storage
+/pageserver/ @neondatabase/storage
 /safekeeper/ @neondatabase/storage
 /storage_controller @neondatabase/storage
 /storage_scrubber @neondatabase/storage
-/vendor/ @neondatabase/compute
+/libs/pageserver_api/ @neondatabase/storage
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/storage
+
+# Shared
+/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
-azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -74,7 +70,7 @@ bindgen = "0.70"
 bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
-bytes = "1.0"
+bytes = "1.9"
 camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
@@ -216,6 +212,12 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

+## Azure SDK crates
+azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
+azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -115,7 +115,7 @@ RUN set -e \

 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.13.1
+ENV SQL_EXPORTER_VERSION=0.16.0
 RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter

 # Keep the version the same as in build-tools.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
+FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter

 #########################################################################################
 #
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -6,6 +6,7 @@
    import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+    import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
    import 'sql_exporter/compute_max_connections.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
    import 'sql_exporter/compute_subscriptions_count.libsonnet',
--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -1,5 +1,9 @@
 [databases]
-*=host=localhost port=5432 auth_user=cloud_admin
+;; pgbouncer propagates application_name (if it's specified) to the server, but some
+;; clients don't set it. We set default application_name=pgbouncer to make it
+;; easier to identify pgbouncer connections in Postgres. If client sets
+;; application_name, it will be used instead.
+*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer
 [pgbouncer]
 listen_port=6432
 listen_addr=0.0.0.0
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
@@ -0,0 +1,7 @@
+SELECT
+  (SELECT current_setting('neon.timeline_id')) AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files
+  (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
@@ -0,0 +1,17 @@
+local neon = import 'neon.libsonnet';
+
+local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
+local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
+
+{
+  metric_name: 'compute_logical_snapshots_bytes',
+  type: 'gauge',
+  help: 'Size of the pg_logical/snapshots directory, not including temporary files',
+  key_labels: [
+    'timeline_id',
+  ],
+  values: [
+    'logical_snapshots_bytes',
+  ],
+  query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
+}
--- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
@@ -0,0 +1,9 @@
+SELECT
+  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  -- Postgres builds snapshots as temporary %X-%X.snap.%d.tmp files and renames
+  -- them to *.snap once complete; only completed files are WAL-logged, so only
+  -- those are summed. Aliasing the function makes `name` the file-name column
+  -- (aliasing a wrapping subquery would make `name` a whole-row value instead).
+  (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
+    FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap'
+  ) AS logical_snapshots_bytes;
--- a/compute/patches/cloud_regress_pg17.patch
+++ b/compute/patches/cloud_regress_pg17.patch
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -246,47 +246,48 @@ fn try_spec_from_cli(
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    let spec;
-    let mut live_config_allowed = false;
-    match spec_json {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => {
-            info!("got spec from cli argument {}", json);
-            spec = Some(serde_json::from_str(json)?);
-        }
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                spec = Some(serde_json::from_reader(file)?);
-                live_config_allowed = true;
-            } else if let Some(id) = compute_id {
-                if let Some(cp_base) = control_plane_uri {
-                    live_config_allowed = true;
-                    spec = match get_spec_from_control_plane(cp_base, id) {
-                        Ok(s) => s,
-                        Err(e) => {
-                            error!("cannot get response from control plane: {}", e);
-                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
-                        }
-                    };
-                } else {
-                    panic!("must specify both --control-plane-uri and --compute-id or none");
-                }
-            } else {
-                panic!(
-                    "compute spec should be provided by one of the following ways: \
-                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
-                );
-            }
-        }
+    // First, try to get cluster spec from the cli argument
+    if let Some(spec_json) = spec_json {
+        info!("got spec from cli argument {}", spec_json);
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_str(spec_json)?),
+            live_config_allowed: false,
+        });
+    }
+
+    // Second, try to read it from the file if path is provided
+    if let Some(spec_path) = spec_path {
+        let file = File::open(Path::new(spec_path))?;
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_reader(file)?),
+            live_config_allowed: true,
+        });
+    }
+
+    let Some(compute_id) = compute_id else {
+        panic!(
+            "compute spec should be provided by one of the following ways: \
+                --spec OR --spec-path OR --control-plane-uri and --compute-id"
+        );
+    };
+    let Some(control_plane_uri) = control_plane_uri else {
+        panic!("must specify both --control-plane-uri and --compute-id or none");
    };

-    Ok(CliSpecParams {
-        spec,
-        live_config_allowed,
-    })
+    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+        Ok(spec) => Ok(CliSpecParams {
+            spec,
+            live_config_allowed: true,
+        }),
+        Err(e) => {
+            error!(
+                "cannot get response from control plane: {}\n\
+                neither spec nor confirmation that compute is in the Empty state was received",
+                e
+            );
+            Err(e)
+        }
+    }
 }

 struct CliSpecParams {
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1243,12 +1243,7 @@ impl ComputeNode {
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
        config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;

-        // TODO(ololobus): We need a concurrency during reconfiguration as well,
-        // but DB is already running and used by user. We can easily get out of
-        // `max_connections` limit, and the current code won't handle that.
-        // let compute_state = self.state.lock().unwrap().clone();
-        // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
-        let max_concurrent_connections = 1;
+        let max_concurrent_connections = spec.reconfigure_concurrency;

        // Temporarily reset max_cluster_size in config
        // to avoid the possibility of hitting the limit, while we are reconfiguring:
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -537,12 +537,12 @@ components:
            properties:
              extname:
                type: string
-              versions:
-                type: array
-                items:
-                  type: string
+              version:
+                type: string
              n_databases:
                type: integer
+              owned_by_superuser:
+                type: integer

    SetRoleGrantsRequest:
      type: object
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,7 +1,6 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
-use std::collections::HashSet;

 use anyhow::Result;
 use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Connect to every database (see list_dbs above) and get the list of installed extensions.
 ///
 /// Same extension can be installed in multiple databases with different versions,
-/// we only keep the highest and lowest version across all databases.
+/// so we report a separate metric (number of databases where it is installed)
+/// for each extension version.
 pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
    conf.application_name("compute_ctl:get_installed_extensions");
    let mut client = conf.connect(NoTls)?;
-
    let databases: Vec<String> = list_dbs(&mut client)?;

-    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
    for db in databases.iter() {
        conf.dbname(db);
        let mut db_client = conf.connect(NoTls)?;
-        let extensions: Vec<(String, String)> = db_client
+        let extensions: Vec<(String, String, i32)> = db_client
            .query(
-                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
                &[],
            )?
            .iter()
-            .map(|row| (row.get("extname"), row.get("extversion")))
+            .map(|row| {
+                (
+                    row.get("extname"),
+                    row.get("extversion"),
+                    row.get("extowner"),
+                )
+            })
            .collect();

-        for (extname, v) in extensions.iter() {
+        for (extname, v, extowner) in extensions.iter() {
            let version = v.to_string();

-            // increment the number of databases where the version of extension is installed
-            INSTALLED_EXTENSIONS
-                .with_label_values(&[extname, &version])
-                .inc();
+            // check if the extension is owned by superuser
+            // 10 is the oid of superuser
+            let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };

            extensions_map
-                .entry(extname.to_string())
+                .entry((
+                    extname.to_string(),
+                    version.clone(),
+                    owned_by_superuser.to_string(),
+                ))
                .and_modify(|e| {
-                    e.versions.insert(version.clone());
                    // count the number of databases where the extension is installed
                    e.n_databases += 1;
                })
                .or_insert(InstalledExtension {
                    extname: extname.to_string(),
-                    versions: HashSet::from([version.clone()]),
+                    version: version.clone(),
                    n_databases: 1,
+                    owned_by_superuser: owned_by_superuser.to_string(),
                });
        }
    }

-    let res = InstalledExtensions {
-        extensions: extensions_map.into_values().collect(),
-    };
+    for (key, ext) in extensions_map.iter() {
+        let (extname, version, owned_by_superuser) = key;
+        let n_databases = ext.n_databases as u64;

-    Ok(res)
+        INSTALLED_EXTENSIONS
+            .with_label_values(&[extname, version, owned_by_superuser])
+            .set(n_databases);
+    }
+
+    Ok(InstalledExtensions {
+        extensions: extensions_map.into_values().collect(),
+    })
 }

 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "compute_installed_extensions",
        "Number of databases where the version of extension is installed",
-        &["extension_name", "version"]
+        &["extension_name", "version", "owned_by_superuser"]
    )
    .expect("failed to define a metric")
 });
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -53,6 +53,7 @@ use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use pageserver_api::shard::ShardStripeSize;
+use reqwest::header::CONTENT_TYPE;
 use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -618,6 +619,7 @@ impl Endpoint {
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
            local_proxy_config: None,
+            reconfigure_concurrency: 1,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -808,7 +810,7 @@ impl Endpoint {
        }

        let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(30))
+            .timeout(Duration::from_secs(120))
            .build()
            .unwrap();
        let response = client
@@ -817,6 +819,7 @@ impl Endpoint {
                self.http_address.ip(),
                self.http_address.port()
            ))
+            .header(CONTENT_TYPE.as_str(), "application/json")
            .body(format!(
                "{{\"spec\":{}}}",
                serde_json::to_string_pretty(&spec)?
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -435,7 +435,7 @@ impl PageServerNode {
    ) -> anyhow::Result<()> {
        let config = Self::parse_config(settings)?;
        self.http_client
-            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
            .await?;

        Ok(())
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -5,6 +5,7 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
+use std::error::Error as _;
 use std::future::Future;
 use std::io::Write;
 use std::path::PathBuf;
@@ -26,7 +27,7 @@ use crate::{

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
-    #[error("Reqwest error: {0}")]
+    #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
    Transport(#[from] reqwest::Error),

    #[error("Error: {0}")]
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -9,8 +9,8 @@ use pageserver_api::{
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
-        TenantShardSplitResponse,
+        ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -116,9 +116,19 @@ enum Command {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// Set the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
+    /// Any previous tenant configs are overwritten.
+    SetTenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
+    /// provided JSON are unset from the tenant config and all fields with non-null values are set.
+    /// Unspecified fields are not changed.
+    PatchTenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TenantConfig { tenant_id, config } => {
+        Command::SetTenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::PatchTenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .patch_tenant_config(&TenantConfigPatchRequest {
                    tenant_id,
                    config: tenant_conf,
                })
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
            threshold,
        } => {
            vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: TenantConfig {
                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
--- a/deny.toml
+++ b/deny.toml
@@ -42,6 +42,7 @@ allow = [
    "MPL-2.0",
    "OpenSSL",
    "Unicode-DFS-2016",
+    "Unicode-3.0",
 ]
 confidence-threshold = 0.8
 exceptions = [
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,6 +1,5 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

-use std::collections::HashSet;
 use std::fmt::Display;

 use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct InstalledExtension {
    pub extname: String,
-    pub versions: HashSet<String>,
+    pub version: String,
    pub n_databases: u32, // Number of databases using this extension
+    pub owned_by_superuser: String,
 }

 #[derive(Clone, Debug, Default, Serialize)]
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -19,6 +19,10 @@ pub type PgIdent = String;
 /// String type alias representing Postgres extension version
 pub type ExtVersion = String;

+fn default_reconfigure_concurrency() -> usize {
+    1
+}
+
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

-    /// An optinal hint that can be passed to speed up startup time if we know
+    /// An optional hint that can be passed to speed up startup time if we know
    /// that no pg catalog mutations (like role creation, database creation,
    /// extension creation) need to be done on the actual database to start.
    #[serde(default)] // Default false
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
-
    pub timeline_id: Option<TimelineId>,
-
    pub pageserver_connstring: Option<String>,

    #[serde(default)]
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
    /// Local Proxy configuration used for JWT authentication
    #[serde(default)]
    pub local_proxy_config: Option<LocalProxySpec>,
+
+    /// Number of concurrent connections during the parallel RunInEachDatabase
+    /// phase of the apply config process.
+    ///
+    /// We need a higher concurrency during reconfiguration in case of many DBs,
+    /// but the instance is already running and used by clients. We can easily exceed
+    /// the `max_connections` limit, and the current code won't handle that.
+    ///
+    /// Default is 1, but also allow control plane to override this value for specific
+    /// projects. It's also recommended to bump `superuser_reserved_connections` +=
+    /// `reconfigure_concurrency` for such projects to ensure that we always have
+    /// enough spare connections for reconfiguration process to succeed.
+    #[serde(default = "default_reconfigure_concurrency")]
+    pub reconfigure_concurrency: usize,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -315,6 +331,9 @@ mod tests {

        // Features list defaults to empty vector.
        assert!(spec.features.is_empty());
+
+        // Reconfigure concurrency defaults to 1.
+        assert_eq!(spec.reconfigure_concurrency, 1);
    }

    #[test]
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }

+/// Scheduling policy enables us to selectively disable some automatic actions that the
+/// controller performs on a tenant shard. This is only set to a non-default value by
+/// human intervention, and it is reset to the default value (Active) when the tenant's
+/// placement policy is modified away from Attached.
+///
+/// The typical use of a non-Active scheduling policy is one of:
+/// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
+/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
+///
+/// If you're not sure which policy to use to pin a shard to its current location, you probably
+/// want Pause.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
    // Normal mode: the tenant's scheduled locations may be updated at will, including
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
 pub struct CompactKey(i128);

 /// The storage key size.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -17,7 +17,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 use utils::{
    completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
    }
 }

+#[derive(Debug, Default, Clone, Eq, PartialEq)]
+pub enum FieldPatch<T> {
+    Upsert(T),
+    Remove,
+    #[default]
+    Noop,
+}
+
+impl<T> FieldPatch<T> {
+    fn is_noop(&self) -> bool {
+        matches!(self, FieldPatch::Noop)
+    }
+
+    pub fn apply(self, target: &mut Option<T>) {
+        match self {
+            Self::Upsert(v) => *target = Some(v),
+            Self::Remove => *target = None,
+            Self::Noop => {}
+        }
+    }
+
+    pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
+        match self {
+            Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
+            Self::Remove => Ok(FieldPatch::<U>::Remove),
+            Self::Noop => Ok(FieldPatch::<U>::Noop),
+        }
+    }
+}
+
+impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        Option::deserialize(deserializer).map(|opt| match opt {
+            None => FieldPatch::Remove,
+            Some(val) => FieldPatch::Upsert(val),
+        })
+    }
+}
+
+impl<T: Serialize> Serialize for FieldPatch<T> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        match self {
+            FieldPatch::Upsert(val) => serializer.serialize_some(val),
+            FieldPatch::Remove => serializer.serialize_none(),
+            FieldPatch::Noop => Err(serde::ser::Error::custom("cannot serialize a Noop patch")),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
+#[serde(default)]
+pub struct TenantConfigPatch {
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_distance: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_target_size: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_threshold: FieldPatch<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_horizon: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub pitr_interval: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub walreceiver_connect_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lagging_wal_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub eviction_policy: FieldPatch<EvictionPolicy>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub min_resident_size_override: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub heatmap_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lazy_slru_download: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_creation_check_threshold: FieldPatch<u8>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length_for_ts: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_offloading: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
 }

+impl TenantConfig {
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch.compaction_period.apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch.gc_period.apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch.pitr_interval.apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .apply(&mut walreceiver_connect_timeout);
+        patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch.heatmap_period.apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch.lsn_lease_length.apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        }
+    }
+}
+
 /// The policy for the aux file storage.
 ///
 /// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
    }
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantConfigPatchRequest {
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
            );
        }
    }
+
+    #[test]
+    fn test_tenant_config_patch_request_serde() {
+        let patch_request = TenantConfigPatchRequest {
+            tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
+            config: TenantConfigPatch {
+                checkpoint_distance: FieldPatch::Upsert(42),
+                gc_horizon: FieldPatch::Remove,
+                compaction_threshold: FieldPatch::Noop,
+                ..TenantConfigPatch::default()
+            },
+        };
+
+        let json = serde_json::to_string(&patch_request).unwrap();
+
+        let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
+        assert_eq!(json, expected);
+
+        let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
+        assert_eq!(decoded.tenant_id, patch_request.tenant_id);
+        assert_eq!(decoded.config, patch_request.config);
+
+        // Now apply the patch to a config to demonstrate semantics
+
+        let base = TenantConfig {
+            checkpoint_distance: Some(28),
+            gc_horizon: Some(100),
+            compaction_target_size: Some(1024),
+            ..Default::default()
+        };
+
+        let expected = TenantConfig {
+            checkpoint_distance: Some(42),
+            gc_horizon: None,
+            ..base.clone()
+        };
+
+        let patched = base.apply_patch(decoded.config);
+
+        assert_eq!(patched, expected);
+    }
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -158,7 +158,8 @@ impl ShardIdentity {
        key_to_shard_number(self.count, self.stripe_size, key)
    }

-    /// Return true if the key should be ingested by this shard
+    /// Return true if the key is stored only on this shard. This does not include
+    /// global keys, see is_key_global().
    ///
    /// Shards must ingest _at least_ keys which return true from this check.
    pub fn is_key_local(&self, key: &Key) -> bool {
@@ -171,7 +172,7 @@ impl ShardIdentity {
    }

    /// Return true if the key should be stored on all shards, not just one.
-    fn is_key_global(&self, key: &Key) -> bool {
+    pub fn is_key_global(&self, key: &Key) -> bool {
        if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
            // Special keys that are only stored on shard 0
            false
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder {

 #[derive(Debug, Clone, Default)]
 pub struct StartupMessageParams {
-    params: Bytes,
+    pub params: Bytes,
 }

 impl StartupMessageParams {
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -117,7 +117,7 @@ enum Credentials<const N: usize> {
    /// A regular password as a vector of bytes.
    Password(Vec<u8>),
    /// A precomputed pair of keys.
-    Keys(Box<ScramKeys<N>>),
+    Keys(ScramKeys<N>),
 }

 enum State {
@@ -176,7 +176,7 @@ impl ScramSha256 {

    /// Constructs a new instance which will use the provided key pair for authentication.
    pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 {
-        let password = Credentials::Keys(keys.into());
+        let password = Credentials::Keys(keys);
        ScramSha256::new_inner(password, channel_binding, nonce())
    }

--- a/libs/proxy/postgres-protocol2/src/message/frontend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs
@@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) {
 }

 #[inline]
-pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()>
-where
-    I: IntoIterator<Item = (&'a str, &'a str)>,
-{
+pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> {
    write_body(buf, |buf| {
        // postgres protocol version 3.0(196608) in bigger-endian
        buf.put_i32(0x00_03_00_00);
-        for (key, value) in parameters {
-            write_cstr(key.as_bytes(), buf)?;
-            write_cstr(value.as_bytes(), buf)?;
-        }
+        buf.put_slice(&parameters.params);
        buf.put_u8(0);
        Ok(())
    })
 }

+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct StartupMessageParams {
+    pub params: BytesMut,
+}
+
+impl StartupMessageParams {
+    /// Set parameter's value by its name.
+    pub fn insert(&mut self, name: &str, value: &str) {
+        if name.contains('\0') || value.contains('\0') {
+            panic!("startup parameter name or value contained a null")
+        }
+        self.params.put_slice(name.as_bytes());
+        self.params.put_u8(0);
+        self.params.put_slice(value.as_bytes());
+        self.params.put_u8(0);
+    }
+}
+
 #[inline]
 pub fn sync(buf: &mut BytesMut) {
    buf.put_u8(b'S');
--- a/libs/proxy/tokio-postgres2/src/codec.rs
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages {
    }
 }

-pub struct PostgresCodec {
-    pub max_message_size: Option<usize>,
-}
+pub struct PostgresCodec;

 impl Encoder<FrontendMessage> for PostgresCodec {
    type Error = io::Error;
@@ -66,15 +64,6 @@ impl Decoder for PostgresCodec {
                break;
            }

-            if let Some(max) = self.max_message_size {
-                if len > max {
-                    return Err(io::Error::new(
-                        io::ErrorKind::InvalidInput,
-                        "message too large",
-                    ));
-                }
-            }
-
            match header.tag() {
                backend::NOTICE_RESPONSE_TAG
                | backend::NOTIFICATION_RESPONSE_TAG
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -6,6 +6,7 @@ use crate::connect_raw::RawConnection;
 use crate::tls::MakeTlsConnect;
 use crate::tls::TlsConnect;
 use crate::{Client, Connection, Error};
+use postgres_protocol2::message::frontend::StartupMessageParams;
 use std::fmt;
 use std::str;
 use std::time::Duration;
@@ -14,16 +15,6 @@ use tokio::io::{AsyncRead, AsyncWrite};
 pub use postgres_protocol2::authentication::sasl::ScramKeys;
 use tokio::net::TcpStream;

-/// Properties required of a session.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-#[non_exhaustive]
-pub enum TargetSessionAttrs {
-    /// No special properties are required.
-    Any,
-    /// The session must allow writes.
-    ReadWrite,
-}
-
 /// TLS configuration.
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 #[non_exhaustive]
@@ -73,94 +64,20 @@ pub enum AuthKeys {
 }

 /// Connection configuration.
-///
-/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats:
-///
-/// # Key-Value
-///
-/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain
-/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped.
-///
-/// ## Keys
-///
-/// * `user` - The username to authenticate with. Required.
-/// * `password` - The password to authenticate with.
-/// * `dbname` - The name of the database to connect to. Defaults to the username.
-/// * `options` - Command line options used to configure the server.
-/// * `application_name` - Sets the `application_name` parameter on the server.
-/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used
-///     if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`.
-/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the
-///     path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts
-///     can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting
-///     with the `connect` method.
-/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be
-///     either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if
-///     omitted or the empty string.
-/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames
-///     can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout.
-/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that
-///     the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server
-///     in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`.
-/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel
-///     binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise.
-///     If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`.
-///
-/// ## Examples
-///
-/// ```not_rust
-/// host=localhost user=postgres connect_timeout=10 keepalives=0
-/// ```
-///
-/// ```not_rust
-/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces'
-/// ```
-///
-/// ```not_rust
-/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write
-/// ```
-///
-/// # Url
-///
-/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional,
-/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple
-/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded,
-/// as the path component of the URL specifies the database name.
-///
-/// ## Examples
-///
-/// ```not_rust
-/// postgresql://user@localhost
-/// ```
-///
-/// ```not_rust
-/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10
-/// ```
-///
-/// ```not_rust
-/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write
-/// ```
-///
-/// ```not_rust
-/// postgresql:///mydb?user=user&host=/var/lib/postgresql
-/// ```
 #[derive(Clone, PartialEq, Eq)]
 pub struct Config {
    pub(crate) host: Host,
    pub(crate) port: u16,

-    pub(crate) user: Option<String>,
    pub(crate) password: Option<Vec<u8>>,
    pub(crate) auth_keys: Option<Box<AuthKeys>>,
-    pub(crate) dbname: Option<String>,
-    pub(crate) options: Option<String>,
-    pub(crate) application_name: Option<String>,
    pub(crate) ssl_mode: SslMode,
    pub(crate) connect_timeout: Option<Duration>,
-    pub(crate) target_session_attrs: TargetSessionAttrs,
    pub(crate) channel_binding: ChannelBinding,
-    pub(crate) replication_mode: Option<ReplicationMode>,
-    pub(crate) max_backend_message_size: Option<usize>,
+    pub(crate) server_params: StartupMessageParams,
+
+    database: bool,
+    username: bool,
 }

 impl Config {
@@ -169,18 +86,15 @@ impl Config {
        Config {
            host: Host::Tcp(host),
            port,
-            user: None,
            password: None,
            auth_keys: None,
-            dbname: None,
-            options: None,
-            application_name: None,
            ssl_mode: SslMode::Prefer,
            connect_timeout: None,
-            target_session_attrs: TargetSessionAttrs::Any,
            channel_binding: ChannelBinding::Prefer,
-            replication_mode: None,
-            max_backend_message_size: None,
+            server_params: StartupMessageParams::default(),
+
+            database: false,
+            username: false,
        }
    }

@@ -188,14 +102,13 @@ impl Config {
    ///
    /// Required.
    pub fn user(&mut self, user: &str) -> &mut Config {
-        self.user = Some(user.to_string());
-        self
+        self.set_param("user", user)
    }

    /// Gets the user to authenticate with, if one has been configured with
    /// the `user` method.
-    pub fn get_user(&self) -> Option<&str> {
-        self.user.as_deref()
+    pub fn user_is_set(&self) -> bool {
+        self.username
    }

    /// Sets the password to authenticate with.
@@ -231,40 +144,26 @@ impl Config {
    ///
    /// Defaults to the user.
    pub fn dbname(&mut self, dbname: &str) -> &mut Config {
-        self.dbname = Some(dbname.to_string());
-        self
+        self.set_param("database", dbname)
    }

    /// Gets the name of the database to connect to, if one has been configured
    /// with the `dbname` method.
-    pub fn get_dbname(&self) -> Option<&str> {
-        self.dbname.as_deref()
+    pub fn db_is_set(&self) -> bool {
+        self.database
    }

-    /// Sets command line options used to configure the server.
-    pub fn options(&mut self, options: &str) -> &mut Config {
-        self.options = Some(options.to_string());
+    pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config {
+        if name == "database" {
+            self.database = true;
+        } else if name == "user" {
+            self.username = true;
+        }
+
+        self.server_params.insert(name, value);
        self
    }

-    /// Gets the command line options used to configure the server, if the
-    /// options have been set with the `options` method.
-    pub fn get_options(&self) -> Option<&str> {
-        self.options.as_deref()
-    }
-
-    /// Sets the value of the `application_name` runtime parameter.
-    pub fn application_name(&mut self, application_name: &str) -> &mut Config {
-        self.application_name = Some(application_name.to_string());
-        self
-    }
-
-    /// Gets the value of the `application_name` runtime parameter, if it has
-    /// been set with the `application_name` method.
-    pub fn get_application_name(&self) -> Option<&str> {
-        self.application_name.as_deref()
-    }
-
    /// Sets the SSL configuration.
    ///
    /// Defaults to `prefer`.
@@ -303,23 +202,6 @@ impl Config {
        self.connect_timeout.as_ref()
    }

-    /// Sets the requirements of the session.
-    ///
-    /// This can be used to connect to the primary server in a clustered database rather than one of the read-only
-    /// secondary servers. Defaults to `Any`.
-    pub fn target_session_attrs(
-        &mut self,
-        target_session_attrs: TargetSessionAttrs,
-    ) -> &mut Config {
-        self.target_session_attrs = target_session_attrs;
-        self
-    }
-
-    /// Gets the requirements of the session.
-    pub fn get_target_session_attrs(&self) -> TargetSessionAttrs {
-        self.target_session_attrs
-    }
-
    /// Sets the channel binding behavior.
    ///
    /// Defaults to `prefer`.
@@ -333,28 +215,6 @@ impl Config {
        self.channel_binding
    }

-    /// Set replication mode.
-    pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config {
-        self.replication_mode = Some(replication_mode);
-        self
-    }
-
-    /// Get replication mode.
-    pub fn get_replication_mode(&self) -> Option<ReplicationMode> {
-        self.replication_mode
-    }
-
-    /// Set limit for backend messages size.
-    pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config {
-        self.max_backend_message_size = Some(max_backend_message_size);
-        self
-    }
-
-    /// Get limit for backend messages size.
-    pub fn get_max_backend_message_size(&self) -> Option<usize> {
-        self.max_backend_message_size
-    }
-
    /// Opens a connection to a PostgreSQL database.
    ///
    /// Requires the `runtime` Cargo feature (enabled by default).
@@ -392,18 +252,13 @@ impl fmt::Debug for Config {
        }

        f.debug_struct("Config")
-            .field("user", &self.user)
            .field("password", &self.password.as_ref().map(|_| Redaction {}))
-            .field("dbname", &self.dbname)
-            .field("options", &self.options)
-            .field("application_name", &self.application_name)
            .field("ssl_mode", &self.ssl_mode)
            .field("host", &self.host)
            .field("port", &self.port)
            .field("connect_timeout", &self.connect_timeout)
-            .field("target_session_attrs", &self.target_session_attrs)
            .field("channel_binding", &self.channel_binding)
-            .field("replication", &self.replication_mode)
+            .field("server_params", &self.server_params)
            .finish()
    }
 }
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -1,14 +1,11 @@
 use crate::client::SocketConfig;
 use crate::codec::BackendMessage;
-use crate::config::{Host, TargetSessionAttrs};
+use crate::config::Host;
 use crate::connect_raw::connect_raw;
 use crate::connect_socket::connect_socket;
 use crate::tls::{MakeTlsConnect, TlsConnect};
-use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage};
-use futures_util::{future, pin_mut, Future, FutureExt, Stream};
+use crate::{Client, Config, Connection, Error, RawConnection};
 use postgres_protocol2::message::backend::Message;
-use std::io;
-use std::task::Poll;
 use tokio::net::TcpStream;
 use tokio::sync::mpsc;

@@ -72,47 +69,7 @@ where
        .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
        .collect();

-    let mut connection = Connection::new(stream, delayed, parameters, receiver);
-
-    if let TargetSessionAttrs::ReadWrite = config.target_session_attrs {
-        let rows = client.simple_query_raw("SHOW transaction_read_only");
-        pin_mut!(rows);
-
-        let rows = future::poll_fn(|cx| {
-            if connection.poll_unpin(cx)?.is_ready() {
-                return Poll::Ready(Err(Error::closed()));
-            }
-
-            rows.as_mut().poll(cx)
-        })
-        .await?;
-        pin_mut!(rows);
-
-        loop {
-            let next = future::poll_fn(|cx| {
-                if connection.poll_unpin(cx)?.is_ready() {
-                    return Poll::Ready(Some(Err(Error::closed())));
-                }
-
-                rows.as_mut().poll_next(cx)
-            });
-
-            match next.await.transpose()? {
-                Some(SimpleQueryMessage::Row(row)) => {
-                    if row.try_get(0)? == Some("on") {
-                        return Err(Error::connect(io::Error::new(
-                            io::ErrorKind::PermissionDenied,
-                            "database does not allow writes",
-                        )));
-                    } else {
-                        break;
-                    }
-                }
-                Some(_) => {}
-                None => return Err(Error::unexpected_message()),
-            }
-        }
-    }
+    let connection = Connection::new(stream, delayed, parameters, receiver);

    Ok((client, connection))
 }
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -1,5 +1,5 @@
 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
-use crate::config::{self, AuthKeys, Config, ReplicationMode};
+use crate::config::{self, AuthKeys, Config};
 use crate::connect_tls::connect_tls;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::tls::{TlsConnect, TlsStream};
@@ -96,12 +96,7 @@ where
    let stream = connect_tls(stream, config.ssl_mode, tls).await?;

    let mut stream = StartupStream {
-        inner: Framed::new(
-            stream,
-            PostgresCodec {
-                max_message_size: config.max_backend_message_size,
-            },
-        ),
+        inner: Framed::new(stream, PostgresCodec),
        buf: BackendMessages::empty(),
        delayed_notice: Vec::new(),
    };
@@ -124,28 +119,8 @@ where
    S: AsyncRead + AsyncWrite + Unpin,
    T: AsyncRead + AsyncWrite + Unpin,
 {
-    let mut params = vec![("client_encoding", "UTF8")];
-    if let Some(user) = &config.user {
-        params.push(("user", &**user));
-    }
-    if let Some(dbname) = &config.dbname {
-        params.push(("database", &**dbname));
-    }
-    if let Some(options) = &config.options {
-        params.push(("options", &**options));
-    }
-    if let Some(application_name) = &config.application_name {
-        params.push(("application_name", &**application_name));
-    }
-    if let Some(replication_mode) = &config.replication_mode {
-        match replication_mode {
-            ReplicationMode::Physical => params.push(("replication", "true")),
-            ReplicationMode::Logical => params.push(("replication", "database")),
-        }
-    }
-
    let mut buf = BytesMut::new();
-    frontend::startup_message(params, &mut buf).map_err(Error::encode)?;
+    frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?;

    stream
        .send(FrontendMessage::Raw(buf.freeze()))
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,15 +8,14 @@ use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
-use std::sync::Arc;
 use std::time::Duration;
 use std::time::SystemTime;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
+use anyhow::Context;
 use anyhow::Result;
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, RetryOptions};
-use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
@@ -76,8 +75,9 @@ impl AzureBlobStorage {
        let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
            StorageCredentials::access_key(account.clone(), access_key)
        } else {
-            let token_credential = DefaultAzureCredential::default();
-            StorageCredentials::token_credential(Arc::new(token_credential))
+            let token_credential = azure_identity::create_default_credential()
+                .context("trying to obtain Azure default credentials")?;
+            StorageCredentials::token_credential(token_credential)
        };

        // we have an outer retry
@@ -624,6 +624,10 @@ impl RemoteStorage for AzureBlobStorage {
        res
    }

+    fn max_keys_per_delete(&self) -> usize {
+        super::MAX_KEYS_PER_DELETE_AZURE
+    }
+
    async fn copy(
        &self,
        from: &RemotePath,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,7 +70,14 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

 /// As defined in S3 docs
-pub const MAX_KEYS_PER_DELETE: usize = 1000;
+///
+/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
+pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
+
+/// As defined in Azure docs
+///
+/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
+pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;

 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';

@@ -340,6 +347,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
        cancel: &CancellationToken,
    ) -> anyhow::Result<()>;

+    /// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking.
+    ///
+    /// The returned value is only an optimization hint: callers may pass a larger number of objects to
+    /// `delete_objects` as well.
+    ///
+    /// The value is guaranteed to be >= 1.
+    fn max_keys_per_delete(&self) -> usize;
+
    /// Deletes all objects matching the given prefix.
    ///
    /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
@@ -533,6 +548,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    /// [`RemoteStorage::max_keys_per_delete`]
+    pub fn max_keys_per_delete(&self) -> usize {
+        match self {
+            Self::LocalFs(s) => s.max_keys_per_delete(),
+            Self::AwsS3(s) => s.max_keys_per_delete(),
+            Self::AzureBlob(s) => s.max_keys_per_delete(),
+            Self::Unreliable(s) => s.max_keys_per_delete(),
+        }
+    }
+
    /// See [`RemoteStorage::delete_prefix`]
    pub async fn delete_prefix(
        &self,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -573,6 +573,10 @@ impl RemoteStorage for LocalFs {
        Ok(())
    }

+    fn max_keys_per_delete(&self) -> usize {
+        super::MAX_KEYS_PER_DELETE_S3
+    }
+
    async fn copy(
        &self,
        from: &RemotePath,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -48,7 +48,7 @@ use crate::{
    metrics::{start_counting_cancelled_wait, start_measuring_requests},
    support::PermitCarrying,
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
-    RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
+    RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

@@ -355,7 +355,7 @@ impl S3Bucket {
        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
            let started_at = start_measuring_requests(kind);

            let req = self
@@ -832,6 +832,10 @@ impl RemoteStorage for S3Bucket {
        self.delete_oids(&permit, &delete_objects, cancel).await
    }

+    fn max_keys_per_delete(&self) -> usize {
+        MAX_KEYS_PER_DELETE_S3
+    }
+
    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths, cancel).await
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -203,6 +203,10 @@ impl RemoteStorage for UnreliableWrapper {
        Ok(())
    }

+    fn max_keys_per_delete(&self) -> usize {
+        self.inner.max_keys_per_delete()
+    }
+
    async fn copy(
        &self,
        from: &RemotePath,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -94,6 +94,8 @@ pub mod toml_edit_ext;

 pub mod circuit_breaker;

+pub mod try_rcu;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -164,6 +164,12 @@ impl TenantShardId {
    }
 }

+impl std::fmt::Display for ShardNumber {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl std::fmt::Display for ShardSlug<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1,5 +1,6 @@
 pub mod heavier_once_cell;

+pub mod duplex;
 pub mod gate;

 pub mod spsc_fold;
--- a/libs/utils/src/sync/duplex.rs
+++ b/libs/utils/src/sync/duplex.rs
@@ -0,0 +1 @@
+pub mod mpsc;
--- a/libs/utils/src/sync/duplex/mpsc.rs
+++ b/libs/utils/src/sync/duplex/mpsc.rs
@@ -0,0 +1,36 @@
+use tokio::sync::mpsc;
+
+/// One endpoint of a bi-directional channel: it sends `S` values and receives `R` values.
+pub struct Duplex<S, R> {
+    pub tx: mpsc::Sender<S>, // sending half: carries `S` values to the peer endpoint
+    pub rx: mpsc::Receiver<R>, // receiving half: yields the `R` values sent by the peer endpoint
+}
+
+/// Creates a bi-directional channel.
+///
+/// Each direction of the channel buffers up to `buffer` messages. Once a direction's buffer is full,
+/// attempts to send in that direction will wait until a message is received from it.
+/// The provided buffer capacity must be at least 1.
+pub fn channel<A: Send, B: Send>(buffer: usize) -> (Duplex<A, B>, Duplex<B, A>) {
+    let (tx_a, rx_a) = mpsc::channel::<A>(buffer);
+    let (tx_b, rx_b) = mpsc::channel::<B>(buffer);
+
+    (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a })
+}
+
+impl<S: Send, R: Send> Duplex<S, R> {
+    /// Sends a value, waiting until there is capacity.
+    ///
+    /// A successful send occurs when it is determined that the other end of the channel has not hung up already.
+    pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError<S>> {
+        self.tx.send(x).await
+    }
+
+    /// Receives the next value for this receiver.
+    ///
+    /// This method returns `None` if the channel has been closed and there are
+    /// no remaining messages in the channel's buffer.
+    pub async fn recv(&mut self) -> Option<R> {
+        self.rx.recv().await
+    }
+}
--- a/libs/utils/src/try_rcu.rs
+++ b/libs/utils/src/try_rcu.rs
@@ -0,0 +1,77 @@
+//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
+
+pub trait ArcSwapExt<T> {
+    /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
+    fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>;
+}
+
+impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
+where
+    T: arc_swap::RefCnt,
+    S: arc_swap::strategy::CaS<T>,
+{
+    fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>,
+    {
+        fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
+        where
+            A: arc_swap::AsRaw<Base>,
+            B: arc_swap::AsRaw<Base>,
+        {
+            let a = a.as_raw(); // compare by raw pointer identity, not by value
+            let b = b.as_raw();
+            std::ptr::eq(a, b)
+        }
+
+        let mut cur = self.load(); // snapshot of the currently stored value
+        loop {
+            let new = f(&cur)?.into(); // an `Err` from `f` returns early, before any swap is attempted
+            let prev = self.compare_and_swap(&*cur, new); // `prev` is what was stored before this attempt
+            let swapped = ptr_eq(&*cur, &*prev); // same pointer => our snapshot was still current
+            if swapped {
+                return Ok(arc_swap::Guard::into_inner(prev)); // hand back the value that was replaced
+            } else {
+                cur = prev; // stored value changed underneath us: re-run `f` against the newer value
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arc_swap::ArcSwap;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_try_rcu_success() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
+
+        assert!(result.is_ok());
+        assert_eq!(**swap.load(), 43);
+    }
+
+    #[test]
+    fn test_try_rcu_error() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<i32, _> {
+            if **value == 42 {
+                Err("err")
+            } else {
+                Ok(**value + 1)
+            }
+        });
+
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "err");
+        assert_eq!(**swap.load(), 42);
+    }
+}
--- a/libs/wal_decoder/proto/interpreted_wal.proto
+++ b/libs/wal_decoder/proto/interpreted_wal.proto
@@ -37,7 +37,7 @@ message ValueMeta {
 }

 message CompactKey {
-  int64 high = 1;
-  int64 low = 2;
+  uint64 high = 1;
+  uint64 low = 2;
 }

--- a/libs/wal_decoder/src/wire_format.rs
+++ b/libs/wal_decoder/src/wire_format.rs
@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
 impl From<CompactKey> for proto::CompactKey {
    fn from(value: CompactKey) -> Self {
        proto::CompactKey {
-            high: (value.raw() >> 64) as i64,
-            low: value.raw() as i64,
+            high: (value.raw() >> 64) as u64,
+            low: value.raw() as u64,
        }
    }
 }
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
        (((value.high as i128) << 64) | (value.low as i128)).into()
    }
 }
+
+#[test]
+fn test_compact_key_with_large_relnode() {
+    use pageserver_api::key::Key;
+
+    let inputs = vec![
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x007FFFFF,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800000,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800001,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0xFFFFFFFF,
+            field3: 0xFFFFFFFF,
+            field4: 0xFFFFFFFF,
+            field5: 0x0,
+            field6: 0x0,
+        },
+    ];
+
+    for input in inputs {
+        assert!(input.is_valid_key_on_write_path());
+        let compact = input.to_compact();
+        let proto: proto::CompactKey = compact.into();
+        let from_proto: CompactKey = proto.into();
+
+        assert_eq!(
+            compact, from_proto,
+            "Round trip failed for key with relnode={:#x}",
+            input.field4
+        );
+    }
+}
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;

+    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-lib=static=pgport");
    println!("cargo:rustc-link-lib=static=pgcommon");
-    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-search={walproposer_lib_search_str}");

    // Rebuild crate when libwalproposer.a changes
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -62,10 +62,8 @@ async fn ingest(
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    let gate = utils::sync::gate::Gate::default();
-    let entered = gate.enter().unwrap();

-    let layer =
-        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
+    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?;

    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
    let data_ser_size = data.serialized_size().unwrap() as usize;
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, error::Error as _};

 use bytes::Bytes;
 use detach_ancestor::AncestorDetached;
@@ -25,10 +25,10 @@ pub struct Client {

 #[derive(thiserror::Error, Debug)]
 pub enum Error {
-    #[error("send request: {0}")]
+    #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
    SendRequest(reqwest::Error),

-    #[error("receive body: {0}")]
+    #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
    ReceiveBody(reqwest::Error),

    #[error("receive error body: {0}")]
@@ -270,12 +270,18 @@ impl Client {
        Ok(body)
    }

-    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
        Ok(())
    }

+    pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
+        self.request(Method::PATCH, &uri, req).await?;
+        Ok(())
+    }
+
    pub async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    println!("operating on timeline {}", timeline);

    mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
+        .set_tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
            config: TenantConfig::default(),
        })
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,3 +1,4 @@
+use std::error::Error as _;
 use std::time::SystemTime;

 use chrono::{DateTime, Utc};
@@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError {

        match self {
            Rejected(code) => write!(f, "server rejected the metrics with {code}"),
-            Reqwest(e) => write!(f, "request failed: {e}"),
+            Reqwest(e) => write!(
+                f,
+                "request failed: {e}{}",
+                e.source().map(|e| format!(": {e}")).unwrap_or_default()
+            ),
            Cancelled => write!(f, "cancelled"),
        }
    }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -9,7 +9,6 @@
 use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
 use remote_storage::TimeoutOrCancel;
-use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
@@ -131,7 +130,8 @@ impl Deleter {
    }

    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
-        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+        let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
+        self.accumulator.reserve(max_keys_per_delete);

        loop {
            if self.cancel.is_cancelled() {
@@ -156,14 +156,14 @@ impl Deleter {

            match msg {
                DeleterMessage::Delete(mut list) => {
-                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
-                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                    while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
+                        if self.accumulator.len() == max_keys_per_delete {
                            self.flush().await?;
                            // If we have received this number of keys, proceed with attempting to execute
                            assert_eq!(self.accumulator.len(), 0);
                        }

-                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let available_slots = max_keys_per_delete - self.accumulator.len();
                        let take_count = std::cmp::min(available_slots, list.len());
                        for path in list.drain(list.len() - take_count..) {
                            self.accumulator.push(path);
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -767,7 +767,27 @@ paths:
  /v1/tenant/config:
    put:
      description: |
-        Update tenant's config.
+        Update tenant's config by setting it to the provided value
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantConfigRequest"
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TenantInfo"
+    patch:
+      description: |
+        Update tenant's config additively by patching the updated fields provided.
+        Null values unset the field and non-null values upsert it.

        Invalid fields in the tenant config will cause the request to be rejected with status 400.
      requestBody:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
+use pageserver_api::models::TenantConfigPatchRequest;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
@@ -87,7 +88,7 @@ use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactOptions;
-use crate::tenant::timeline::CompactRange;
+use crate::tenant::timeline::CompactRequest;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
@@ -279,7 +280,10 @@ impl From<TenantStateError> for ApiError {
 impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()),
+            GetTenantError::ShardNotFound(tid) => {
+                ApiError::NotFound(anyhow!("tenant {tid}").into())
+            }
            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
@@ -387,6 +391,16 @@ impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
    }
 }

+impl From<crate::tenant::secondary::SecondaryTenantError> for ApiError {
+    fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError {
+        use crate::tenant::secondary::SecondaryTenantError;
+        match ste {
+            SecondaryTenantError::GetTenant(gte) => gte.into(),
+            SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown,
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -1047,9 +1061,11 @@ async fn timeline_delete_handler(
            match e {
                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
                // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
-                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
-                    "Requested tenant is missing".to_string().into_boxed_str(),
-                ),
+                GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => {
+                    ApiError::PreconditionFailed(
+                        "Requested tenant is missing".to_string().into_boxed_str(),
+                    )
+                }
                e => e.into(),
            }
        })?;
@@ -1680,7 +1696,47 @@ async fn update_tenant_config_handler(
    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+
+    let _ = tenant
+        .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
+        .expect("Closure returns Ok()");
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn patch_tenant_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> { // handler registered for PATCH /v1/tenant/config
+    let request_data: TenantConfigPatchRequest = json_request(&mut request).await?; // body carries the tenant id and the config patch
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id); // this endpoint always addresses the unsharded shard id
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    let updated = tenant // NOTE(review): the patch is applied here, before persist_tenant_config below (the PUT handler persists first) - confirm a failed persist cannot leave memory and disk out of sync
+        .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
+        .map_err(ApiError::BadRequest)?; // a patch that fails to apply is reported as a bad request
+
+    // This is a legacy API that only operates on attached tenants: the preferred
+    // API to use is the location_config/ endpoint, which lets the caller provide
+    // the full LocationConf.
+    let location_conf = LocationConf::attached_single(
+        updated,
+        tenant.get_generation(),
+        &ShardParameters::default(),
+    );
+
+    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+        .await
+        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

    json_response(StatusCode::OK, ())
 }
@@ -1963,6 +2019,26 @@ async fn timeline_gc_handler(
    json_response(StatusCode::OK, gc_result)
 }

+// Cancel scheduled compaction tasks
+async fn timeline_cancel_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+        tenant.cancel_scheduled_compaction(timeline_id);
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await
+}
+
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    mut request: Request<Body>,
@@ -1972,7 +2048,7 @@ async fn timeline_compact_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
+    let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;

    let state = get_state(&request);

@@ -1997,22 +2073,50 @@ async fn timeline_compact_handler(
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

+    let wait_until_scheduled_compaction_done =
+        parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
+            .unwrap_or(false);
+
+    let sub_compaction = compact_request
+        .as_ref()
+        .map(|r| r.sub_compaction)
+        .unwrap_or(false);
    let options = CompactOptions {
-        compact_range,
+        compact_range: compact_request
+            .as_ref()
+            .and_then(|r| r.compact_range.clone()),
+        compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
        flags,
+        sub_compaction,
    };

+    let scheduled = compact_request
+        .as_ref()
+        .map(|r| r.scheduled)
+        .unwrap_or(false);
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .compact_with_options(&cancel, options, &ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+        if scheduled {
+            let tenant = state
+                .tenant_manager
+                .get_attached_tenant_shard(tenant_shard_id)?;
+            let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
+            if wait_until_scheduled_compaction_done {
+                // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
+                rx.await.ok();
+            }
+        } else {
+            timeline
+                .compact_with_options(&cancel, options, &ctx)
+                .await
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            if wait_until_uploaded {
+                timeline.remote_client.wait_completion().await
+                // XXX map to correct ApiError for the cases where it's due to shutdown
+                .context("wait completion").map_err(ApiError::InternalServerError)?;
+            }
        }
        json_response(StatusCode::OK, ())
    }
@@ -2093,16 +2197,20 @@ async fn timeline_checkpoint_handler(
    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);

+    let wait_until_flushed: bool =
+        parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
+
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .freeze_and_flush()
-            .await
-            .map_err(|e| {
+        if wait_until_flushed {
+            timeline.freeze_and_flush().await
+        } else {
+            timeline.freeze().await.and(Ok(()))
+        }.map_err(|e| {
                match e {
                    tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
                    other => ApiError::InternalServerError(other.into()),
@@ -2462,8 +2570,7 @@ async fn secondary_upload_handler(
    state
        .secondary_controller
        .upload_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -2578,7 +2685,7 @@ async fn secondary_download_handler(
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
-        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
+        Ok(Err(e)) => return Err(e.into()),
        // A timeout is not an error: we have started the download, we're just not done
        // yet.  The caller will get a response body indicating status.
        Err(_) => StatusCode::ACCEPTED,
@@ -3222,6 +3329,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
+        .patch("/v1/tenant/config", |r| {
+            api_handler(r, patch_tenant_config_handler)
+        })
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
@@ -3287,6 +3397,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
            |r| api_handler(r, timeline_compact_handler),
        )
+        .delete(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
+            |r| api_handler(r, timeline_cancel_compact_handler),
+        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
            |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
-use tracing::warn;
 use utils::id::TimelineId;

 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -464,6 +463,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_disk_consistent_lsn",
+        "Disk consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_projected_remote_consistent_lsn",
+        "Projected remote consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_pitr_history_size",
@@ -1205,54 +1222,163 @@ pub(crate) mod virtual_file_io_engine {
    });
 }

-pub(crate) struct SmgrOpTimer {
-    global_latency_histo: Histogram,
+pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
+pub(crate) struct SmgrOpTimerInner {
+    global_execution_latency_histo: Histogram,
+    per_timeline_execution_latency_histo: Option<Histogram>,

-    // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<Histogram>,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,

-    start: Instant,
-    throttled: Duration,
-    op: SmgrQueryType,
+    global_flush_in_progress_micros: IntCounter,
+    per_timeline_flush_in_progress_micros: IntCounter,
+
+    timings: SmgrOpTimerState,
+}
+
+#[derive(Debug)]
+enum SmgrOpTimerState {
+    Received {
+        received_at: Instant,
+    },
+    ThrottleDoneExecutionStarting {
+        received_at: Instant,
+        throttle_started_at: Instant,
+        started_execution_at: Instant,
+    },
+}
+
+pub(crate) struct SmgrOpFlushInProgress {
+    flush_started_at: Instant,
+    global_micros: IntCounter,
+    per_timeline_micros: IntCounter,
 }

 impl SmgrOpTimer {
-    pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
-        let Some(throttle) = throttle else {
-            return;
-        };
-        self.throttled += *throttle;
+    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
+        let inner = self.0.as_mut().expect("other public methods consume self"); // inner is only taken by smgr_op_end
+        match (&mut inner.timings, throttle) {
+            (SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
+                ThrottleResult::NotThrottled { start } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at,
+                        throttle_started_at: *start,
+                        started_execution_at: *start, // no throttling: execution starts when the throttle check started
+                    };
+                }
+                ThrottleResult::Throttled { start, end } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at, // keep the real arrival time; using `*start` here made batch wait (throttle_started_at - received_at) always zero
+                        throttle_started_at: *start,
+                        started_execution_at: *end,
+                    };
+                }
+            },
+            (x, _) => panic!("called in unexpected state: {x:?}"), // this transition is only valid from `Received`
+        }
+    }
+
+    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
+        let (flush_start, inner) = self
+            .smgr_op_end()
+            .expect("this method consume self, and the only other caller is drop handler");
+        let SmgrOpTimerInner {
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            ..
+        } = inner;
+        SmgrOpFlushInProgress {
+            flush_started_at: flush_start,
+            global_micros: global_flush_in_progress_micros,
+            per_timeline_micros: per_timeline_flush_in_progress_micros,
+        }
+    }
+
+    /// Returns `None` if this method has already been called, `Some` otherwise.
+    fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
+        let inner = self.0.take()?;
+
+        let now = Instant::now();
+
+        let batch;
+        let execution;
+        let throttle;
+        match inner.timings {
+            SmgrOpTimerState::Received { received_at } => {
+                batch = (now - received_at).as_secs_f64();
+                // TODO: use label for dropped requests.
+                // This is quite rare in practice, only during tenant/pageservers shutdown.
+                throttle = Duration::ZERO;
+                execution = Duration::ZERO.as_secs_f64();
+            }
+            SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                received_at,
+                throttle_started_at,
+                started_execution_at,
+            } => {
+                batch = (throttle_started_at - received_at).as_secs_f64();
+                throttle = started_execution_at - throttle_started_at;
+                execution = (now - started_execution_at).as_secs_f64();
+            }
+        }
+
+        // update time spent in batching
+        inner.global_batch_wait_time.observe(batch);
+        inner.per_timeline_batch_wait_time.observe(batch);
+
+        // time spent in throttle metric is updated by throttle impl
+        let _ = throttle;
+
+        // update metrics for execution latency
+        inner.global_execution_latency_histo.observe(execution);
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution);
+        }
+
+        Some((now, inner))
    }
 }

 impl Drop for SmgrOpTimer {
    fn drop(&mut self) {
-        let elapsed = self.start.elapsed();
+        self.smgr_op_end();
+    }
+}

-        let elapsed = match elapsed.checked_sub(self.throttled) {
-            Some(elapsed) => elapsed,
-            None => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[self.op];
-                rate_limit.call(|| {
-                    warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
-                });
-                elapsed // un-throttled time, more info than just saturating to 0
+impl SmgrOpFlushInProgress {
+    /// Drives `fut` to completion, adding elapsed flush time to both counters every 10s and when the guard is dropped.
+    pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
+    where
+        Fut: std::future::Future<Output = O>,
+    {
+        let mut fut = std::pin::pin!(fut);
+        // Whenever observe_guard gets called, or dropped, it adds the time elapsed
+        // since its last call to the metrics. The last call is tracked in
+        // `self.flush_started_at`, so the clock must be re-sampled on every call.
+        let mut observe_guard = scopeguard::guard(
+            || {
+                let now = Instant::now(); // sampled per call; a timestamp captured once outside the closure made every observation add ~0
+                let elapsed = now - self.flush_started_at;
+                self.global_micros
+                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
+                self.per_timeline_micros
+                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
+                self.flush_started_at = now;
+            },
+            |mut observe| {
+                observe();
+            },
+        );

+        loop {
+            match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
+                Ok(v) => return v,
+                Err(_timeout) => {
+                    (*observe_guard)();
+                }
            }
-        };
-
-        let elapsed = elapsed.as_secs_f64();
-
-        self.global_latency_histo.observe(elapsed);
-        if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(elapsed);
        }
    }
 }
@@ -1284,6 +1410,10 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_getpage_latency: Histogram,
    global_batch_size: Histogram,
    per_timeline_batch_size: Histogram,
+    global_flush_in_progress_micros: IntCounter,
+    per_timeline_flush_in_progress_micros: IntCounter,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,
 }

 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1306,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
    .expect("failed to define a metric")
 });

+// Alias so all histograms recording per-timeline smgr timings use the same buckets.
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
+        "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
-        CRITICAL_OP_BUCKETS.into(),
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -1369,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds_global",
-        "Time spent on smgr query handling, aggregated by query type.",
+        "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
        &["smgr_query_type"],
        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
    )
@@ -1446,6 +1579,45 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
        .set(value.try_into().unwrap());
 }

+static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_service_pagestream_flush_in_progress_micros",
+        "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
+         If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
+         easily discoverable in monitoring. \
+         Hence, this is NOT a completion latency histogram.",
+        &["tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_service_pagestream_flush_in_progress_micros_global",
+        "Like pageserver_page_service_pagestream_flush_in_progress_micros, but instance-wide.",
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds",
+        "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
+        "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
+        SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
+    )
+    .expect("failed to define a metric")
+});
+
 impl SmgrQueryTimePerTimeline {
    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1486,6 +1658,17 @@ impl SmgrQueryTimePerTimeline {
            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
            .unwrap();

+        let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
+        let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
+        let global_flush_in_progress_micros =
+            PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
+        let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
        Self {
            global_started,
            global_latency,
@@ -1493,9 +1676,13 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_getpage_started,
            global_batch_size,
            per_timeline_batch_size,
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            global_batch_wait_time,
+            per_timeline_batch_wait_time,
        }
    }
-    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
+    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
        self.global_started[op as usize].inc();

        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1505,13 +1692,17 @@ impl SmgrQueryTimePerTimeline {
            None
        };

-        SmgrOpTimer {
-            global_latency_histo: self.global_latency[op as usize].clone(),
-            per_timeline_latency_histo,
-            start: started_at,
-            op,
-            throttled: Duration::ZERO,
-        }
+        SmgrOpTimer(Some(SmgrOpTimerInner {
+            global_execution_latency_histo: self.global_latency[op as usize].clone(),
+            per_timeline_execution_latency_histo: per_timeline_latency_histo,
+            timings: SmgrOpTimerState::Received { received_at },
+            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
+            per_timeline_flush_in_progress_micros: self
+                .per_timeline_flush_in_progress_micros
+                .clone(),
+            global_batch_wait_time: self.global_batch_wait_time.clone(),
+            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
+        }))
    }

    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
@@ -2186,6 +2377,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
    .expect("failed to define a metric"),
 });

+pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_timeline_wal_records_received",
+        "Number of WAL records received per shard",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
@@ -2394,7 +2594,8 @@ pub(crate) struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
-    pub last_record_gauge: IntGauge,
+    pub last_record_lsn_gauge: IntGauge,
+    pub disk_consistent_lsn_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
    pub(crate) layer_size_image: UIntGauge,
@@ -2412,6 +2613,7 @@ pub(crate) struct TimelineMetrics {
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
    /// Number of valid LSN leases.
    pub valid_lsn_lease_count_gauge: UIntGauge,
+    pub wal_records_received: IntCounter,
    shutdown: std::sync::atomic::AtomicBool,
 }

@@ -2475,7 +2677,11 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let last_record_gauge = LAST_RECORD_LSN
+        let last_record_lsn_gauge = LAST_RECORD_LSN
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
+        let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

@@ -2565,6 +2771,10 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

+        let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
        TimelineMetrics {
            tenant_id,
            shard_id,
@@ -2578,7 +2788,8 @@ impl TimelineMetrics {
            garbage_collect_histo,
            find_gc_cutoffs_histo,
            load_layer_map_histo,
-            last_record_gauge,
+            last_record_lsn_gauge,
+            disk_consistent_lsn_gauge,
            pitr_history_size,
            archival_size,
            layer_size_image,
@@ -2596,6 +2807,7 @@ impl TimelineMetrics {
                evictions_with_low_residence_duration,
            ),
            valid_lsn_lease_count_gauge,
+            wal_records_received,
            shutdown: std::sync::atomic::AtomicBool::default(),
        }
    }
@@ -2642,6 +2854,7 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
@@ -2732,6 +2945,21 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
        ]);
+        let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
+        let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
+        let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
    }
 }

@@ -2762,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
 use crate::tenant::Timeline;

 /// Maintain a per timeline gauge in addition to the global gauge.
@@ -2805,6 +3034,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
    calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
 }

 impl RemoteTimelineClientMetrics {
@@ -2819,6 +3049,10 @@ impl RemoteTimelineClientMetrics {
                .unwrap(),
        );

+        let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
+            .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
+            .unwrap();
+
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id_str,
            shard_id: shard_id_str,
@@ -2827,6 +3061,7 @@ impl RemoteTimelineClientMetrics {
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge,
+            projected_remote_consistent_lsn_gauge,
        }
    }

@@ -3040,6 +3275,7 @@ impl Drop for RemoteTimelineClientMetrics {
            calls,
            bytes_started_counter,
            bytes_finished_counter,
+            projected_remote_consistent_lsn_gauge,
        } = self;
        for ((a, b), _) in calls.get_mut().unwrap().drain() {
            let mut res = [Ok(()), Ok(())];
@@ -3069,6 +3305,14 @@ impl Drop for RemoteTimelineClientMetrics {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
+        {
+            let _ = projected_remote_consistent_lsn_gauge;
+            let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+        }
    }
 }

@@ -3601,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
        &CIRCUIT_BREAKERS_BROKEN,
        &CIRCUIT_BREAKERS_UNBROKEN,
+        &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
    ]
    .into_iter()
    .for_each(|c| {
@@ -3648,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
        &WAL_REDO_BYTES_HISTOGRAM,
        &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
        &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
+        &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
    ]
    .into_iter()
    .for_each(|h| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -575,7 +575,10 @@ enum BatchedFeMessage {
 }

 impl BatchedFeMessage {
-    async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
+    async fn throttle_and_record_start_processing(
+        &mut self,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
        let (shard, tokens, timers) = match self {
            BatchedFeMessage::Exists { shard, timer, .. }
            | BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
            }
        };
        for timer in timers {
-            timer.deduct_throttle(&throttled);
+            timer.observe_throttle_done_execution_starting(&throttled);
        }
        Ok(())
    }
@@ -1017,10 +1020,8 @@ impl PageServerHandler {
        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        let mut timers: smallvec::SmallVec<[_; 1]> =
-            smallvec::SmallVec::with_capacity(handler_results.len());
        for handler_result in handler_results {
-            let response_msg = match handler_result {
+            let (response_msg, timer) = match handler_result {
                Err(e) => match &e {
                    PageStreamError::Shutdown => {
                        // If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1044,34 +1045,66 @@ impl PageServerHandler {
                        span.in_scope(|| {
                            error!("error reading relation or page version: {full:#}")
                        });
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
+                        (
+                            PagestreamBeMessage::Error(PagestreamErrorResponse {
+                                message: e.to_string(),
+                            }),
+                            None, // TODO: measure errors
+                        )
                    }
                },
-                Ok((response_msg, timer)) => {
-                    // Extending the lifetime of the timers so observations on drop
-                    // include the flush time.
-                    timers.push(timer);
-                    response_msg
-                }
+                Ok((response_msg, timer)) => (response_msg, Some(timer)),
            };

+            //
            // marshal & transmit response message
+            //
+
            pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-        }
-        tokio::select! {
-            biased;
-            _ = cancel.cancelled() => {
-                // We were requested to shut down.
-                info!("shutdown request received in page handler");
-                return Err(QueryError::Shutdown)
-            }
-            res = pgb_writer.flush() => {
-                res?;
+
+            // We purposefully don't count flush time into the timer.
+            //
+            // The reason is that current compute client will not perform protocol processing
+            // if the postgres backend process is doing things other than `->smgr_read()`.
+            // This is especially the case for prefetch.
+            //
+            // If the compute doesn't read from the connection, eventually TCP will backpressure
+            // all the way into our flush call below.
+            //
+            // The timer's underlying metric is used for a storage-internal latency SLO and
+            // we don't want to include latency in it that we can't control.
+            // And as pointed out above, in this case, we don't control the time that flush will take.
+            let flushing_timer =
+                timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
+
+            // what we want to do
+            let flush_fut = pgb_writer.flush();
+            // metric for how long flushing takes
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => {
+                    futures::future::Either::Left(flushing_timer.measure(flush_fut))
+                }
+                None => futures::future::Either::Right(flush_fut),
+            };
+            // do it while respecting cancellation
+            let _: () = async move {
+                tokio::select! {
+                    biased;
+                    _ = cancel.cancelled() => {
+                        // We were requested to shut down.
+                        info!("shutdown request received in page handler");
+                        return Err(QueryError::Shutdown)
+                    }
+                    res = flush_fut => {
+                        res?;
+                    }
+                }
+                Ok(())
            }
+            // and log the info! line inside the request span
+            .instrument(span.clone())
+            .await?;
        }
-        drop(timers);
        Ok(())
    }

@@ -1200,7 +1233,7 @@ impl PageServerHandler {
                }
            };

-            if let Err(cancelled) = msg.throttle(&self.cancel).await {
+            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
                break cancelled;
            }

@@ -1367,7 +1400,9 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    batch.throttle(&self.cancel).await?;
+                    batch
+                        .throttle_and_record_start_processing(&self.cancel)
+                        .await?;
                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
                        .await?;
                }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -37,14 +37,19 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
+use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::CompactFlags;
+use timeline::CompactOptions;
+use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -63,6 +68,7 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::try_rcu::ArcSwapExt;
 use utils::zstd::create_zst_tarball;
 use utils::zstd::extract_zst_tarball;

@@ -339,6 +345,11 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

+    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
+    /// a manual gc-compaction from the manual compaction API.
+    scheduled_compaction_tasks:
+        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+
    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
@@ -2953,27 +2964,109 @@ impl Tenant {

        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
        {
+            // pending_task_left == None: compaction was not attempted (cannot compact); tasks may still be pending
+            // pending_task_left == Some(true): at least one compaction task is still pending
+            // pending_task_left == Some(false): no compaction task is pending
            let pending_task_left = if *can_compact {
-                Some(
-                    timeline
-                        .compact(cancel, EnumSet::empty(), ctx)
-                        .instrument(info_span!("compact_timeline", %timeline_id))
-                        .await
-                        .inspect_err(|e| match e {
-                            timeline::CompactionError::ShuttingDown => (),
-                            timeline::CompactionError::Offload(_) => {
-                                // Failures to offload timelines do not trip the circuit breaker, because
-                                // they do not do lots of writes the way compaction itself does: it is cheap
-                                // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                let has_pending_l0_compaction_task = timeline
+                    .compact(cancel, EnumSet::empty(), ctx)
+                    .instrument(info_span!("compact_timeline", %timeline_id))
+                    .await
+                    .inspect_err(|e| match e {
+                        timeline::CompactionError::ShuttingDown => (),
+                        timeline::CompactionError::Offload(_) => {
+                            // Failures to offload timelines do not trip the circuit breaker, because
+                            // they do not do lots of writes the way compaction itself does: it is cheap
+                            // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                        }
+                        timeline::CompactionError::Other(e) => {
+                            self.compaction_circuit_breaker
+                                .lock()
+                                .unwrap()
+                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
+                        }
+                    })?;
+                if has_pending_l0_compaction_task {
+                    Some(true)
+                } else {
+                    let mut has_pending_scheduled_compaction_task;
+                    let next_scheduled_compaction_task = {
+                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
+                            if !tline_pending_tasks.is_empty() {
+                                info!(
+                                    "{} tasks left in the compaction schedule queue",
+                                    tline_pending_tasks.len()
+                                );
                            }
-                            timeline::CompactionError::Other(e) => {
-                                self.compaction_circuit_breaker
-                                    .lock()
-                                    .unwrap()
-                                    .fail(&CIRCUIT_BREAKERS_BROKEN, e);
+                            let next_task = tline_pending_tasks.pop_front();
+                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
+                            next_task
+                        } else {
+                            has_pending_scheduled_compaction_task = false;
+                            None
+                        }
+                    };
+                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
+                    {
+                        if !next_scheduled_compaction_task
+                            .options
+                            .flags
+                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                        {
+                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
+                        } else if next_scheduled_compaction_task.options.sub_compaction {
+                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+                            let jobs = timeline
+                                .gc_compaction_split_jobs(next_scheduled_compaction_task.options)
+                                .await
+                                .map_err(CompactionError::Other)?;
+                            if jobs.is_empty() {
+                                info!("no jobs to run, skipping scheduled compaction task");
+                            } else {
+                                has_pending_scheduled_compaction_task = true;
+                                let jobs_len = jobs.len();
+                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
+                                for (idx, job) in jobs.into_iter().enumerate() {
+                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
+                                        ScheduledCompactionTask {
+                                            options: job,
+                                            // The last job in the queue sends the signal and releases the gc guard
+                                            result_tx: next_scheduled_compaction_task
+                                                .result_tx
+                                                .take(),
+                                            gc_block: next_scheduled_compaction_task
+                                                .gc_block
+                                                .take(),
+                                        }
+                                    } else {
+                                        ScheduledCompactionTask {
+                                            options: job,
+                                            result_tx: None,
+                                            gc_block: None,
+                                        }
+                                    });
+                                }
+                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
                            }
-                        })?,
-                )
+                        } else {
+                            let _ = timeline
+                                .compact_with_options(
+                                    cancel,
+                                    next_scheduled_compaction_task.options,
+                                    ctx,
+                                )
+                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
+                                .await?;
+                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
+                                // TODO: we can send compaction statistics in the future
+                                tx.send(()).ok();
+                            }
+                        }
+                    }
+                    Some(has_pending_scheduled_compaction_task)
+                }
            } else {
                None
            };
@@ -2993,6 +3086,43 @@ impl Tenant {
        Ok(has_pending_task)
    }

+    /// Cancel all scheduled compaction tasks of a timeline: empties its queue and returns the removed tasks.
+    pub(crate) fn cancel_scheduled_compaction(
+        &self,
+        timeline_id: TimelineId,
+    ) -> Vec<ScheduledCompactionTask> {
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
+            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); // map entry stays, now empty
+            current_tline_pending_tasks.into_iter().collect()
+        } else {
+            Vec::new() // no queue exists for this timeline
+        }
+    }
+
+    /// Schedule a compaction task for a timeline: append it to that timeline's queue and return the receiver paired with the task's `result_tx`.
+    pub(crate) async fn schedule_compaction(
+        &self,
+        timeline_id: TimelineId,
+        options: CompactOptions,
+    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
+        let gc_guard = match self.gc_block.start().await { // guard is stored in the queued task (`gc_block` field) below
+            Ok(guard) => guard,
+            Err(e) => {
+                bail!("cannot run gc-compaction because gc is blocked: {}", e);
+            }
+        };
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); // std mutex: taken after the `.await`, never held across one
+        let tline_pending_tasks = guard.entry(timeline_id).or_default(); // creates the queue on first use
+        tline_pending_tasks.push_back(ScheduledCompactionTask {
+            options,
+            result_tx: Some(tx),
+            gc_block: Some(gc_guard),
+        });
+        Ok(rx)
+    }
+
    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
    // this happens during ingest: this background housekeeping is for freezing layers
    // that are open but haven't been written to for some time.
@@ -3422,7 +3552,7 @@ impl Tenant {
                            r.map_err(
                            |_e: tokio::sync::watch::error::RecvError|
                                // Tenant existed but was dropped: report it as non-existent
-                                GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
+                                GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id))
                        )?
                        }
                        Err(TimeoutCancellableError::Cancelled) => {
@@ -3792,25 +3922,28 @@ impl Tenant {
        }
    }

-    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
+    pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
+        &self,
+        update: F,
+    ) -> anyhow::Result<TenantConfOpt> {
        // Use read-copy-update in order to avoid overwriting the location config
        // state if this races with [`Tenant::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.

-        self.tenant_conf.rcu(|inner| {
-            Arc::new(AttachedTenantConf {
-                tenant_conf: new_tenant_conf.clone(),
-                location: inner.location,
-                // Attached location is not changed, no need to update lsn lease deadline.
-                lsn_lease_deadline: inner.lsn_lease_deadline,
-            })
-        });
+        self.tenant_conf
+            .try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
+                Ok(Arc::new(AttachedTenantConf {
+                    tenant_conf: update(attached_conf.tenant_conf.clone())?,
+                    location: attached_conf.location,
+                    lsn_lease_deadline: attached_conf.lsn_lease_deadline,
+                }))
+            })?;

-        let updated = self.tenant_conf.load().clone();
+        let updated = self.tenant_conf.load();

-        self.tenant_conf_updated(&new_tenant_conf);
+        self.tenant_conf_updated(&updated.tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
@@ -3818,6 +3951,8 @@ impl Tenant {
        for timeline in timelines {
            timeline.tenant_conf_updated(&updated);
        }
+
+        Ok(updated.tenant_conf.clone())
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
@@ -4005,6 +4140,7 @@ impl Tenant {
                // use an extremely long backoff.
                Some(Duration::from_secs(3600 * 24)),
            )),
+            scheduled_compaction_tasks: Mutex::new(Default::default()),
            activate_now_sem: tokio::sync::Semaphore::new(0),
            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
            cancel: CancellationToken::default(),
@@ -4376,7 +4512,12 @@ impl Tenant {
                // - this timeline was created while we were finding cutoffs
                // - lsn for timestamp search fails for this timeline repeatedly
                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
-                    target.cutoffs = cutoffs.clone();
+                    let original_cutoffs = target.cutoffs.clone();
+                    // GC cutoffs must never move backwards: keep the larger of the previous and the newly computed LSN
+                    target.cutoffs = GcCutoffs {
+                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
+                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                    }
                }
            }

@@ -8036,6 +8177,12 @@ mod tests {
            )
            .await?;
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            guard.cutoffs.time = Lsn(0x30);
@@ -8138,6 +8285,12 @@ mod tests {

        // increase GC horizon and compact again
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            guard.cutoffs.time = Lsn(0x40);
@@ -8518,6 +8671,12 @@ mod tests {
                .await?
        };
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -8599,6 +8758,12 @@ mod tests {

        // increase GC horizon and compact again
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            guard.cutoffs.time = Lsn(0x40);
@@ -9046,6 +9211,12 @@ mod tests {
            )
            .await?;
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -9163,6 +9334,7 @@ mod tests {
                CompactOptions {
                    flags: dryrun_flags,
                    compact_range: None,
+                    ..Default::default()
                },
                &ctx,
            )
@@ -9187,6 +9359,12 @@ mod tests {

        // increase GC horizon and compact again
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x38))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            guard.cutoffs.time = Lsn(0x38);
@@ -9282,6 +9460,12 @@ mod tests {
            )
            .await?;
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -9399,6 +9583,7 @@ mod tests {
                CompactOptions {
                    flags: dryrun_flags,
                    compact_range: None,
+                    ..Default::default()
                },
                &ctx,
            )
@@ -9525,6 +9710,12 @@ mod tests {
        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));

        {
+            parent_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x10))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = parent_tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -9539,6 +9730,12 @@ mod tests {
        }

        {
+            branch_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x50))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = branch_tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -9868,6 +10065,12 @@ mod tests {
            .await?;

        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
@@ -9885,7 +10088,15 @@ mod tests {

        // Do a partial compaction on key range 0..2
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(2)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9924,7 +10135,15 @@ mod tests {

        // Do a partial compaction on key range 2..4
        tline
-            .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(2)..get_key(4)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9968,7 +10187,15 @@ mod tests {

        // Do a partial compaction on key range 4..9
        tline
-            .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(4)..get_key(9)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10011,7 +10238,15 @@ mod tests {

        // Do a partial compaction on key range 9..10
        tline
-            .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(9)..get_key(10)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10059,7 +10294,15 @@ mod tests {

        // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(10)).into()),
+                    ..Default::default()
+                },
+                &ctx,
+            )
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
                .or(global_conf.wal_receiver_protocol_override),
        }
    }
+
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> { // Applies `patch` field by field to `self` (consumed) and returns the rebuilt config.
+        let Self { // destructured without `..`: a struct field that is not listed here is a compile error
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance); // NOTE(review): `apply` is defined on the patch field type (not shown here); presumably it sets, clears or keeps the target
+        patch
+            .checkpoint_timeout
+            .map(|v| humantime::parse_duration(&v))? // patch value is parsed by humantime first; a parse error returns early via `?`, before any config is built
+            .apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch
+            .compaction_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch
+            .gc_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch
+            .pitr_interval
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut walreceiver_connect_timeout);
+        patch
+            .lagging_wal_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch
+            .heatmap_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .lsn_lease_length
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Ok(Self { // reassemble the config from the locals, each of which was passed to `apply` above
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        })
+    }
 }

 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -8,10 +8,8 @@ use crate::page_cache;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::owned_buffers_io::write::Buffer;
 use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
-use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use num_traits::Num;
 use pageserver_api::shard::TenantShardId;
@@ -20,6 +18,7 @@ use tracing::error;

 use std::io;
 use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
 use utils::id::TimelineId;

 pub struct EphemeralFile {
@@ -27,10 +26,7 @@ pub struct EphemeralFile {
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
    bytes_written: u64,
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        BytesMut,
-        size_tracking_writer::Writer<VirtualFile>,
-    >,
+    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
    _gate_guard: utils::sync::gate::GateGuard,
 }
@@ -42,9 +38,9 @@ impl EphemeralFile {
        conf: &PageServerConf,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
        ctx: &RequestContext,
-    ) -> Result<EphemeralFile, io::Error> {
+    ) -> anyhow::Result<EphemeralFile> {
        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
        let filename_disambiguator =
            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -55,15 +51,17 @@ impl EphemeralFile {
                "ephemeral-{filename_disambiguator}"
            )));

-        let file = VirtualFile::open_with_options(
-            &filename,
-            virtual_file::OpenOptions::new()
-                .read(true)
-                .write(true)
-                .create(true),
-            ctx,
-        )
-        .await?;
+        let file = Arc::new(
+            VirtualFile::open_with_options_v2(
+                &filename,
+                virtual_file::OpenOptions::new()
+                    .read(true)
+                    .write(true)
+                    .create(true),
+                ctx,
+            )
+            .await?,
+        );

        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore

@@ -73,10 +71,12 @@ impl EphemeralFile {
            page_cache_file_id,
            bytes_written: 0,
            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
-                size_tracking_writer::Writer::new(file),
-                BytesMut::with_capacity(TAIL_SZ),
+                file,
+                || IoBufferMut::with_capacity(TAIL_SZ),
+                gate.enter()?,
+                ctx,
            ),
-            _gate_guard: gate_guard,
+            _gate_guard: gate.enter()?,
        })
    }
 }
@@ -85,7 +85,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = self.buffered_writer.as_inner().as_inner().path();
+        let path = self.buffered_writer.as_inner().path();
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
@@ -132,6 +132,18 @@ impl EphemeralFile {
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> std::io::Result<u64> {
+        let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?;
+        if let Some(control) = control {
+            control.release().await;
+        }
+        Ok(pos)
+    }
+
+    async fn write_raw_controlled(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> std::io::Result<(u64, Option<owned_buffers_io::write::FlushControl>)> {
        let pos = self.bytes_written;

        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
@@ -145,9 +157,9 @@ impl EphemeralFile {
        })?;

        // Write the payload
-        let nwritten = self
+        let (nwritten, control) = self
            .buffered_writer
-            .write_buffered_borrowed(srcbuf, ctx)
+            .write_buffered_borrowed_controlled(srcbuf, ctx)
            .await?;
        assert_eq!(
            nwritten,
@@ -157,7 +169,7 @@ impl EphemeralFile {

        self.bytes_written = new_bytes_written;

-        Ok(pos)
+        Ok((pos, control))
    }
 }

@@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
        dst: tokio_epoll_uring::Slice<B>,
        ctx: &'a RequestContext,
    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
-        let file_size_tracking_writer = self.buffered_writer.as_inner();
-        let flushed_offset = file_size_tracking_writer.bytes_written();
+        let submitted_offset = self.buffered_writer.bytes_submitted();

-        let buffer = self.buffered_writer.inspect_buffer();
-        let buffered = &buffer[0..buffer.pending()];
+        let mutable = self.buffered_writer.inspect_mutable();
+        let mutable = &mutable[0..mutable.pending()];
+
+        let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();

        let dst_cap = dst.bytes_total().into_u64();
        let end = {
@@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
                }
            }
        }
-        let written_range = Range(start, std::cmp::min(end, flushed_offset));
-        let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
+
+        let (written_range, maybe_flushed_range) = {
+            if maybe_flushed.is_some() {
+                // [       written       ][ maybe_flushed ][    mutable    ]
+                //                        <-   TAIL_SZ   -><-   TAIL_SZ   ->
+                //                                         ^
+                //                                 `submitted_offset`
+                // <++++++ on disk +++++++????????????????>
+                (
+                    Range(
+                        start,
+                        std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)),
+                    ),
+                    Range(
+                        std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)),
+                        std::cmp::min(end, submitted_offset),
+                    ),
+                )
+            } else {
+                // [       written                        ][    mutable    ]
+                //                                         <-   TAIL_SZ   ->
+                //                                         ^
+                //                                 `submitted_offset`
+                // <++++++ on disk +++++++++++++++++++++++>
+                (
+                    Range(start, std::cmp::min(end, submitted_offset)),
+                    // zero len
+                    Range(submitted_offset, u64::MIN),
+                )
+            }
+        };
+
+        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

        let dst = if written_range.len() > 0 {
-            let file: &VirtualFile = file_size_tracking_writer.as_inner();
+            let file: &VirtualFile = self.buffered_writer.as_inner();
            let bounds = dst.bounds();
            let slice = file
                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
@@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
            dst
        };

-        let dst = if buffered_range.len() > 0 {
-            let offset_in_buffer = buffered_range
+        let dst = if maybe_flushed_range.len() > 0 {
+            let offset_in_buffer = maybe_flushed_range
                .0
-                .checked_sub(flushed_offset)
+                .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64))
                .unwrap()
                .into_usize();
-            let to_copy =
-                &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
+            // Checked previously the buffer is Some.
+            let maybe_flushed = maybe_flushed.unwrap();
+            let to_copy = &maybe_flushed
+                [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())];
            let bounds = dst.bounds();
            let mut view = dst.slice({
                let start = written_range.len().into_usize();
                let end = start
-                    .checked_add(buffered_range.len().into_usize())
+                    .checked_add(maybe_flushed_range.len().into_usize())
                    .unwrap();
                start..end
            });
@@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
            dst
        };

+        let dst = if mutable_range.len() > 0 {
+            let offset_in_buffer = mutable_range
+                .0
+                .checked_sub(submitted_offset)
+                .unwrap()
+                .into_usize();
+            let to_copy =
+                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
+            let bounds = dst.bounds();
+            let mut view = dst.slice({
+                let start =
+                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
+                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
+                start..end
+            });
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+        } else {
+            dst
+        };
+
        // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs

        Ok((dst, (end - start).into_usize()))
@@ -295,7 +363,7 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();

-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
            .await
            .unwrap();

@@ -326,14 +394,15 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();

-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();

-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        let mutable = file.buffered_writer.inspect_mutable();
+        let cap = mutable.capacity();
+        let align = mutable.align();

-        let write_nbytes = cap + cap / 2;
+        let write_nbytes = cap * 2 + cap / 2;

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
@@ -341,30 +410,39 @@ mod tests {
            .collect();

        let mut value_offsets = Vec::new();
-        for i in 0..write_nbytes {
-            let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
+        for range in (0..write_nbytes)
+            .step_by(align)
+            .map(|start| start..(start + align).min(write_nbytes))
+        {
+            let off = file.write_raw(&content[range], &ctx).await.unwrap();
            value_offsets.push(off);
        }

-        assert!(file.len() as usize == write_nbytes);
-        for i in 0..write_nbytes {
-            assert_eq!(value_offsets[i], i.into_u64());
-            let buf = IoBufferMut::with_capacity(1);
+        assert_eq!(file.len() as usize, write_nbytes);
+        for (i, range) in (0..write_nbytes)
+            .step_by(align)
+            .map(|start| start..(start + align).min(write_nbytes))
+            .enumerate()
+        {
+            assert_eq!(value_offsets[i], range.start.into_u64());
+            let buf = IoBufferMut::with_capacity(range.len());
            let (buf_slice, nread) = file
-                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
+                .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx)
                .await
                .unwrap();
            let buf = buf_slice.into_inner();
-            assert_eq!(nread, 1);
-            assert_eq!(&buf, &content[i..i + 1]);
+            assert_eq!(nread, range.len());
+            assert_eq!(&buf, &content[range]);
        }

-        let file_contents =
-            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
-        assert_eq!(file_contents, &content[0..cap]);
+        let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
+        assert!(file_contents == content[0..cap * 2]);

-        let buffer_contents = file.buffered_writer.inspect_buffer();
-        assert_eq!(buffer_contents, &content[cap..write_nbytes]);
+        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
+        assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
+
+        let mutable_buffer_contents = file.buffered_writer.inspect_mutable();
+        assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
    }

    #[tokio::test]
@@ -373,16 +451,16 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();

-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();

-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        // mutable buffer and maybe_flushed buffer each has `cap` bytes.
+        let cap = file.buffered_writer.inspect_mutable().capacity();

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
+            .take(cap * 2 + cap / 2)
            .collect();

        file.write_raw(&content, &ctx).await.unwrap();
@@ -390,23 +468,21 @@ mod tests {
        // assert the state is as this test expects it to be
        assert_eq!(
            &file.load_to_io_buf(&ctx).await.unwrap(),
-            &content[0..cap + cap / 2]
+            &content[0..cap * 2 + cap / 2]
        );
-        let md = file
-            .buffered_writer
-            .as_inner()
-            .as_inner()
-            .path()
-            .metadata()
-            .unwrap();
+        let md = file.buffered_writer.as_inner().path().metadata().unwrap();
        assert_eq!(
            md.len(),
-            cap.into_u64(),
-            "buffered writer does one write if we write 1.5x buffer capacity"
+            2 * cap.into_u64(),
+            "buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
        );
        assert_eq!(
-            &file.buffered_writer.inspect_buffer()[0..cap / 2],
-            &content[cap..cap + cap / 2]
+            &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
+            &content[cap..cap * 2]
+        );
+        assert_eq!(
+            &file.buffered_writer.inspect_mutable()[0..cap / 2],
+            &content[cap * 2..cap * 2 + cap / 2]
        );
    }

@@ -422,19 +498,19 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();

-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
-
-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();

+        let mutable = file.buffered_writer.inspect_mutable();
+        let cap = mutable.capacity();
+        let align = mutable.align();
        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
+            .take(cap * 2 + cap / 2)
            .collect();

-        file.write_raw(&content, &ctx).await.unwrap();
+        let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap();

        let test_read = |start: usize, len: usize| {
            let file = &file;
@@ -454,16 +530,38 @@ mod tests {
            }
        };

+        let test_read_all_offset_combinations = || {
+            async move {
+                test_read(align, align).await;
+                // border onto edge of file
+                test_read(cap - align, align).await;
+                // read across file and buffer
+                test_read(cap - align, 2 * align).await;
+                // stay from start of maybe flushed buffer
+                test_read(cap, align).await;
+                // completely within maybe flushed buffer
+                test_read(cap + align, align).await;
+                // border onto edge of maybe flushed buffer.
+                test_read(cap * 2 - align, align).await;
+                // read across maybe flushed and mutable buffer
+                test_read(cap * 2 - align, 2 * align).await;
+                // read across three segments
+                test_read(cap - align, cap + 2 * align).await;
+                // completely within mutable buffer
+                test_read(cap * 2 + align, align).await;
+            }
+        };
+
        // completely within the file range
-        assert!(20 < cap, "test assumption");
-        test_read(10, 10).await;
-        // border onto edge of file
-        test_read(cap - 10, 10).await;
-        // read across file and buffer
-        test_read(cap - 10, 20).await;
-        // stay from start of buffer
-        test_read(cap, 10).await;
-        // completely within buffer
-        test_read(cap + 10, 10).await;
+        assert!(align < cap, "test assumption");
+        assert!(cap % align == 0);
+
+        // test reads at different flush stages.
+        let not_started = control.unwrap().into_not_started();
+        test_read_all_offset_combinations().await;
+        let in_progress = not_started.ready_to_flush();
+        test_read_all_offset_combinations().await;
+        in_progress.wait_until_flush_is_done().await;
+        test_read_all_offset_combinations().await;
    }
 }
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, sync::Arc};

 use utils::id::TimelineId;

@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
    /// Do not add any more features taking and forbidding taking this lock. It should be
    /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
    /// synchronizes with gc attempts by locking and unlocking this mutex.
-    blocking: tokio::sync::Mutex<()>,
+    blocking: Arc<tokio::sync::Mutex<()>>,
 }

 impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
    /// it's ending, or if not currently possible, a value describing the reasons why not.
    ///
    /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+    pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
        let reasons = {
            let g = self.reasons.lock().unwrap();

@@ -44,7 +44,7 @@ impl GcBlock {
            Err(reasons)
        } else {
            Ok(Guard {
-                _inner: self.blocking.lock().await,
+                _inner: self.blocking.clone().lock_owned().await,
            })
        }
    }
@@ -170,8 +170,8 @@ impl GcBlock {
    }
 }

-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
+pub(crate) struct Guard { // Returned by `GcBlock::start`; no lifetime parameter, so it does not borrow the `GcBlock`.
+    _inner: tokio::sync::OwnedMutexGuard<()>, // owned guard from `blocking.clone().lock_owned()`; never read, kept only so the lock is held until drop
 }

 #[derive(Debug)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -894,7 +894,7 @@ impl TenantManager {
            Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
            Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
            None | Some(TenantSlot::Secondary(_)) => {
-                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+                Err(GetTenantError::ShardNotFound(tenant_shard_id))
            }
        }
    }
@@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError {
    #[error("Tenant {0} not found")]
    NotFound(TenantId),

+    #[error("Tenant {0} not found")]
+    ShardNotFound(TenantShardId),
+
    #[error("Tenant {0} is not active")]
    NotActive(TenantShardId),

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -681,6 +681,7 @@ impl RemoteTimelineClient {
        layer_file_name: &LayerName,
        layer_metadata: &LayerFileMetadata,
        local_path: &Utf8Path,
+        gate: &utils::sync::gate::Gate,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, DownloadError> {
@@ -700,6 +701,7 @@ impl RemoteTimelineClient {
                layer_file_name,
                layer_metadata,
                local_path,
+                gate,
                cancel,
                ctx,
            )
@@ -2190,6 +2192,9 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);

                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -26,8 +26,6 @@ use crate::span::{
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
-#[cfg_attr(target_os = "macos", allow(unused_imports))]
-use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{
@@ -60,6 +58,7 @@ pub async fn download_layer_file<'a>(
    layer_file_name: &'a LayerName,
    layer_metadata: &'a LayerFileMetadata,
    local_path: &Utf8Path,
+    gate: &utils::sync::gate::Gate,
    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
@@ -88,7 +87,9 @@ pub async fn download_layer_file<'a>(
    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);

    let bytes_amount = download_retry(
-        || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
+        || async {
+            download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
+        },
        &format!("download {remote_path:?}"),
        cancel,
    )
@@ -148,6 +149,7 @@ async fn download_object<'a>(
    storage: &'a GenericRemoteStorage,
    src_path: &RemotePath,
    dst_path: &Utf8PathBuf,
+    #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
    cancel: &CancellationToken,
    #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
@@ -205,13 +207,18 @@ async fn download_object<'a>(
        }
        #[cfg(target_os = "linux")]
        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
-            use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
-            use bytes::BytesMut;
+            use crate::virtual_file::owned_buffers_io;
+            use crate::virtual_file::IoBufferMut;
+            use std::sync::Arc;
            async {
-                let destination_file = VirtualFile::create(dst_path, ctx)
-                    .await
-                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
-                    .map_err(DownloadError::Other)?;
+                let destination_file = Arc::new(
+                    VirtualFile::create(dst_path, ctx)
+                        .await
+                        .with_context(|| {
+                            format!("create a destination file for layer '{dst_path}'")
+                        })
+                        .map_err(DownloadError::Other)?,
+                );

                let mut download = storage
                    .download(src_path, &DownloadOpts::default(), cancel)
@@ -219,14 +226,16 @@ async fn download_object<'a>(

                pausable_failpoint!("before-downloading-layer-stream-pausable");

+                let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
+                    destination_file,
+                    || IoBufferMut::with_capacity(super::BUFFER_SIZE),
+                    gate.enter().map_err(|_| DownloadError::Cancelled)?,
+                    ctx,
+                );
+
                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
-                    let size_tracking = size_tracking_writer::Writer::new(destination_file);
-                    let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
-                        size_tracking,
-                        BytesMut::with_capacity(super::BUFFER_SIZE),
-                    );
                    while let Some(res) =
                        futures::StreamExt::next(&mut download.download_stream).await
                    {
@@ -234,10 +243,10 @@ async fn download_object<'a>(
                            Ok(chunk) => chunk,
                            Err(e) => return Err(e),
                        };
-                        buffered.write_buffered(chunk.slice_len(), ctx).await?;
+                        buffered.write_buffered_borrowed(&chunk, ctx).await?;
                    }
-                    let size_tracking = buffered.flush_and_into_inner(ctx).await?;
-                    Ok(size_tracking.into_inner())
+                    let inner = buffered.flush_and_into_inner(ctx).await?;
+                    Ok(inner)
                }
                .await?;

--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -22,6 +22,7 @@ use super::{
    mgr::TenantManager,
    span::debug_assert_current_span_has_tenant_id,
    storage_layer::LayerName,
+    GetTenantError,
 };

 use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
@@ -66,7 +67,21 @@ struct CommandRequest<T> {
 }

 struct CommandResponse {
-    result: anyhow::Result<()>,
+    result: Result<(), SecondaryTenantError>,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum SecondaryTenantError {
+    #[error("{0}")]
+    GetTenant(GetTenantError),
+    #[error("shutting down")]
+    ShuttingDown,
+}
+
+impl From<GetTenantError> for SecondaryTenantError {
+    fn from(gte: GetTenantError) -> Self {
+        Self::GetTenant(gte)
+    }
 }

 // Whereas [`Tenant`] represents an attached tenant, this type represents the work
@@ -285,7 +300,7 @@ impl SecondaryController {
        &self,
        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
        payload: T,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), SecondaryTenantError> {
        let (response_tx, response_rx) = tokio::sync::oneshot::channel();

        queue
@@ -294,20 +309,26 @@ impl SecondaryController {
                response_tx,
            })
            .await
-            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;

        let response = response_rx
            .await
-            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;

        response.result
    }

-    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn upload_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
            .await
    }
-    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn download_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
        self.dispatch(
            &self.download_req_tx,
            DownloadCommand::Download(tenant_shard_id),
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -35,7 +35,7 @@ use super::{
        self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
        TenantBackgroundJobs,
    },
-    SecondaryTenant,
+    GetTenantError, SecondaryTenant, SecondaryTenantError,
 };

 use crate::tenant::{
@@ -470,15 +470,16 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
        result
    }

-    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+    fn on_command(
+        &mut self,
+        command: DownloadCommand,
+    ) -> Result<PendingDownload, SecondaryTenantError> {
        let tenant_shard_id = command.get_tenant_shard_id();

        let tenant = self
            .tenant_manager
-            .get_secondary_tenant_shard(*tenant_shard_id);
-        let Some(tenant) = tenant else {
-            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-        };
+            .get_secondary_tenant_shard(*tenant_shard_id)
+            .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?;

        Ok(PendingDownload {
            target_time: None,
@@ -1182,6 +1183,7 @@ impl<'a> TenantDownloader<'a> {
            &layer.name,
            &layer.metadata,
            &local_path,
+            &self.secondary_state.gate,
            &self.secondary_state.cancel,
            ctx,
        )
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -28,7 +28,7 @@ use super::{
        self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
        TenantBackgroundJobs,
    },
-    CommandRequest, UploadCommand,
+    CommandRequest, SecondaryTenantError, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
@@ -279,7 +279,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }

-    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+    fn on_command(
+        &mut self,
+        command: UploadCommand,
+    ) -> Result<UploadPending, SecondaryTenantError> {
        let tenant_shard_id = command.get_tenant_shard_id();

        tracing::info!(
@@ -287,8 +290,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            "Starting heatmap write on command");
        let tenant = self
            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id)
-            .map_err(|e| anyhow::anyhow!(e))?;
+            .get_attached_tenant_shard(*tenant_shard_id)?;
        if !tenant.is_active() {
            return Err(GetTenantError::NotActive(*tenant_shard_id).into());
        }
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -12,7 +12,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::{completion::Barrier, yielding_loop::yielding_loop};

-use super::{CommandRequest, CommandResponse};
+use super::{CommandRequest, CommandResponse, SecondaryTenantError};

 /// Scheduling interval is the time between calls to JobGenerator::schedule.
 /// When we schedule jobs, the job generator may provide a hint of its preferred
@@ -112,7 +112,7 @@ where

    /// Called when a command is received.  A job will be spawned immediately if the return
    /// value is Some, ignoring concurrency limits and the pending queue.
-    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
+    fn on_command(&mut self, cmd: CMD) -> Result<PJ, SecondaryTenantError>;
 }

 /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -555,13 +555,12 @@ impl InMemoryLayer {
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_lsn: Lsn,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
        ctx: &RequestContext,
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file =
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?;
        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1149,6 +1149,7 @@ impl LayerInner {
                &self.desc.layer_name(),
                &self.metadata(),
                &self.path,
+                &timeline.gate,
                &timeline.cancel,
                ctx,
            )
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -58,6 +58,11 @@ pub struct Stats {
    pub sum_throttled_usecs: u64,
 }

+pub enum ThrottleResult {
+    NotThrottled { start: Instant },
+    Throttled { start: Instant, end: Instant },
+}
+
 impl<M> Throttle<M>
 where
    M: Metric,
@@ -122,15 +127,15 @@ where
        self.inner.load().rate_limiter.steady_rps()
    }

-    pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
+    pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

-        if !inner.enabled {
-            return None;
-        }
-
        let start = std::time::Instant::now();

+        if !inner.enabled {
+            return ThrottleResult::NotThrottled { start };
+        }
+
        self.metric.accounting_start();
        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
            let observation = Observation { wait_time };
            self.metric.observe_throttling(&observation);
-            Some(wait_time)
+            ThrottleResult::Throttled { start, end: now }
        } else {
-            None
+            ThrottleResult::NotThrottled { start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -53,7 +53,7 @@ use utils::{
    postgres_client::PostgresClientProtocol,
    sync::gate::{Gate, GateGuard},
 };
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
    Background,
 }

-#[derive(enumset::EnumSetType)]
+#[derive(Debug, enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
@@ -777,6 +777,19 @@ pub(crate) enum CompactFlags {
    DryRun,
 }

+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactRequest {
+    pub compact_range: Option<CompactRange>,
+    pub compact_below_lsn: Option<Lsn>,
+    /// Whether the compaction job should be scheduled.
+    #[serde(default)]
+    pub scheduled: bool,
+    /// Whether the compaction job should be split across key ranges.
+    #[serde(default)]
+    pub sub_compaction: bool,
+}
+
 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
 pub(crate) struct CompactRange {
@@ -786,10 +799,27 @@ pub(crate) struct CompactRange {
    pub end: Key,
 }

-#[derive(Clone, Default)]
+impl From<Range<Key>> for CompactRange {
+    fn from(range: Range<Key>) -> Self {
+        CompactRange {
+            start: range.start,
+            end: range.end,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
+    /// If set, the compaction will only compact the key range specified by this option.
+    /// This option is only used by GC compaction.
    pub compact_range: Option<CompactRange>,
+    /// If set, the compaction will only compact layers below or intersecting with this LSN.
+    /// This option is only used by GC compaction.
+    pub compact_below_lsn: Option<Lsn>,
+    /// Enable sub-compaction (split compaction job across key ranges).
+    /// This option is only used by GC compaction.
+    pub sub_compaction: bool,
 }

 impl std::fmt::Debug for Timeline {
@@ -1433,23 +1463,31 @@ impl Timeline {
        Ok(lease)
    }

-    /// Flush to disk all data that was written with the put_* functions
+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
+    pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
+        self.freeze0().await
+    }
+
+    /// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
    pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
        self.freeze_and_flush0().await
    }

+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
+        let mut g = self.write_lock.lock().await;
+        let to_lsn = self.get_last_record_lsn();
+        self.freeze_inmem_layer_at(to_lsn, &mut g).await
+    }
+
    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
    // polluting the span hierarchy.
    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let token = {
-            // Freeze the current open in-memory layer. It will be written to disk on next
-            // iteration.
-            let mut g = self.write_lock.lock().await;
-
-            let to_lsn = self.get_last_record_lsn();
-            self.freeze_inmem_layer_at(to_lsn, &mut g).await?
-        };
+        let token = self.freeze0().await?;
        self.wait_flush_completion(token).await
    }

@@ -1604,6 +1642,8 @@ impl Timeline {
            CompactOptions {
                flags,
                compact_range: None,
+                compact_below_lsn: None,
+                sub_compaction: false,
            },
            ctx,
        )
@@ -2359,7 +2399,7 @@ impl Timeline {

            result
                .metrics
-                .last_record_gauge
+                .last_record_lsn_gauge
                .set(disk_consistent_lsn.0 as i64);
            result
        })
@@ -3455,7 +3495,6 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<InMemoryLayer>> {
        let mut guard = self.layers.write().await;
-        let gate_guard = self.gate.enter().context("enter gate for inmem layer")?;

        let last_record_lsn = self.get_last_record_lsn();
        ensure!(
@@ -3472,7 +3511,7 @@ impl Timeline {
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
-                gate_guard,
+                &self.gate,
                ctx,
            )
            .await?;
@@ -3482,7 +3521,7 @@ impl Timeline {
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());

-        self.metrics.last_record_gauge.set(new_lsn.0 as i64);
+        self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
        self.last_record_lsn.advance(new_lsn);
    }

@@ -3850,6 +3889,10 @@ impl Timeline {
    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
+
+        self.metrics
+            .disk_consistent_lsn_gauge
+            .set(new_value.0 as i64);
        new_value != old_value
    }

@@ -5888,6 +5931,23 @@ impl<'a> TimelineWriter<'a> {
            return Ok(());
        }

+        // In debug builds, assert that we don't write any keys that don't belong to this shard.
+        // We don't assert this in release builds, since key ownership policies may change over
+        // time. Stray keys will be removed during compaction.
+        if cfg!(debug_assertions) {
+            for metadata in &batch.metadata {
+                if let ValueMeta::Serialized(metadata) = metadata {
+                    let key = Key::from_compact(metadata.key);
+                    assert!(
+                        self.shard_identity.is_key_local(&key)
+                            || self.shard_identity.is_key_global(&key),
+                        "key {key} does not belong on shard {}",
+                        self.shard_identity.shard_index()
+                    );
+                }
+            }
+        }
+
        let batch_max_lsn = batch.max_lsn;
        let buf_size: u64 = batch.buffer_size() as u64;

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,13 +10,12 @@ use std::sync::Arc;

 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
+    CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
+    ImageLayerCreationMode, RecordedDuration, Timeline,
 };

 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
-use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
@@ -30,7 +29,6 @@ use utils::id::TimelineId;
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
 use crate::statvfs::Statvfs;
-use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -43,7 +41,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
-use crate::tenant::{DeltaLayer, MaybeOffloaded};
+use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use pageserver_api::config::tenant_conf_defaults::{
    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -64,6 +62,15 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

+/// A scheduled compaction task.
+pub(crate) struct ScheduledCompactionTask {
+    pub options: CompactOptions,
+    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
+    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
+    pub gc_block: Option<gc_block::Guard>,
+}
+
 pub struct GcCompactionJobDescription {
    /// All layers to read in the compaction job
    selected_layers: Vec<Layer>,
@@ -1174,11 +1181,12 @@ impl Timeline {
                    .await
                    .map_err(CompactionError::Other)?;
            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
+                let shard = self.shard_identity.shard_index();
+                let owner = self.shard_identity.get_shard_number(&key);
+                if cfg!(debug_assertions) {
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
+                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
            }

            if !new_layers.is_empty() {
@@ -1746,22 +1754,113 @@ impl Timeline {
        Ok(())
    }

-    pub(crate) async fn compact_with_gc(
+    /// Split a gc-compaction job into multiple compaction jobs. Ideally, this function should return a vector of
+    /// `GcCompactionJobDescription`. But we want to keep it simple on the tenant scheduling side without exposing too much
+    /// ad-hoc information about gc compaction itself.
+    pub(crate) async fn gc_compaction_split_jobs(
        self: &Arc<Self>,
-        cancel: &CancellationToken,
        options: CompactOptions,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(
-            options
-                .compact_range
-                .map(|range| range.start..range.end)
-                .unwrap_or_else(|| Key::MIN..Key::MAX),
-            cancel,
-            options.flags,
-            ctx,
-        )
-        .await
+    ) -> anyhow::Result<Vec<CompactOptions>> {
+        if !options.sub_compaction {
+            return Ok(vec![options]);
+        }
+        let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
+            start: Key::MIN,
+            end: Key::MAX,
+        });
+        let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
+            compact_below_lsn
+        } else {
+            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
+        };
+        let mut compact_jobs = Vec::new();
+        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
+        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
+        let Ok(partition) = self.partitioning.try_lock() else {
+            bail!("failed to acquire partition lock");
+        };
+        let ((dense_ks, sparse_ks), _) = &*partition;
+        // Truncate the key range to be within user specified compaction range.
+        fn truncate_to(
+            source_start: &Key,
+            source_end: &Key,
+            target_start: &Key,
+            target_end: &Key,
+        ) -> Option<(Key, Key)> {
+            let start = source_start.max(target_start);
+            let end = source_end.min(target_end);
+            if start < end {
+                Some((*start, *end))
+            } else {
+                None
+            }
+        }
+        let mut split_key_ranges = Vec::new();
+        let ranges = dense_ks
+            .parts
+            .iter()
+            .map(|partition| partition.ranges.iter())
+            .chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
+            .flatten()
+            .cloned()
+            .collect_vec();
+        for range in ranges.iter() {
+            let Some((start, end)) = truncate_to(
+                &range.start,
+                &range.end,
+                &compact_range.start,
+                &compact_range.end,
+            ) else {
+                continue;
+            };
+            split_key_ranges.push((start, end));
+        }
+        split_key_ranges.sort();
+        let guard = self.layers.read().await;
+        let layer_map = guard.layer_map()?;
+        let mut current_start = None;
+        // Split compaction job to about 4GB each
+        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
+        let ranges_num = split_key_ranges.len();
+        for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
+            if current_start.is_none() {
+                current_start = Some(start);
+            }
+            let start = current_start.unwrap();
+            if start >= end {
+                // We have already processed this partition.
+                continue;
+            }
+            let res = layer_map.range_search(start..end, compact_below_lsn);
+            let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
+            if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
+                let mut compact_options = options.clone();
+                // Try to extend the compaction range so that we include at least one full layer file.
+                let extended_end = res
+                    .found
+                    .keys()
+                    .map(|layer| layer.layer.key_range.end)
+                    .min();
+                // It is possible that the search range does not contain any layer files when we reach the end of the loop.
+                // In this case, we simply use the specified key range end.
+                let end = if let Some(extended_end) = extended_end {
+                    extended_end.max(end)
+                } else {
+                    end
+                };
+                info!(
+                    "splitting compaction job: {}..{}, estimated_size={}",
+                    start, end, total_size
+                );
+                compact_options.compact_range = Some(CompactRange { start, end });
+                compact_options.compact_below_lsn = Some(compact_below_lsn);
+                compact_options.sub_compaction = false;
+                compact_jobs.push(compact_options);
+                current_start = Some(end);
+            }
+        }
+        drop(guard);
+        Ok(compact_jobs)
    }

    /// An experimental compaction building block that combines compaction with garbage collection.
@@ -1771,19 +1870,51 @@ impl Timeline {
    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
    /// and create delta layers with all deltas >= gc horizon.
    ///
-    /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
+    /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
    /// Partial compaction will read and process all layers overlapping with the key range, even if it might
    /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
    /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
    /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
    /// part of the range.
-    pub(crate) async fn partial_compact_with_gc(
+    ///
+    /// If `options.compact_below_lsn` is provided, the compaction will only compact layers that are below or intersect with
+    /// the LSN. Otherwise, it will use the gc cutoff by default.
+    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
-        compaction_key_range: Range<Key>,
        cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        options: CompactOptions,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        if options.sub_compaction {
+            info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+            let jobs = self.gc_compaction_split_jobs(options).await?;
+            let jobs_len = jobs.len();
+            for (idx, job) in jobs.into_iter().enumerate() {
+                info!(
+                    "running enhanced gc bottom-most compaction, sub-compaction {}/{}",
+                    idx + 1,
+                    jobs_len
+                );
+                self.compact_with_gc_inner(cancel, job, ctx).await?;
+            }
+            if jobs_len == 0 {
+                info!("no jobs to run, skipping gc bottom-most compaction");
+            }
+            return Ok(());
+        }
+        self.compact_with_gc_inner(cancel, options, ctx).await
+    }
+
+    async fn compact_with_gc_inner(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        options: CompactOptions,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        assert!(
+            !options.sub_compaction,
+            "sub-compaction should be handled by the outer function"
+        );
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1803,6 +1934,12 @@ impl Timeline {
        )
        .await?;

+        let flags = options.flags;
+        let compaction_key_range = options
+            .compact_range
+            .map(|range| range.start..range.end)
+            .unwrap_or_else(|| Key::MIN..Key::MAX);
+
        let dry_run = flags.contains(CompactFlags::DryRun);

        if compaction_key_range == (Key::MIN..Key::MAX) {
@@ -1826,7 +1963,22 @@ impl Timeline {
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
            let mut retain_lsns_below_horizon = Vec::new();
-            let gc_cutoff = gc_info.cutoffs.select_min();
+            let gc_cutoff = {
+                // Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
+                // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
+                // cleaning everything that it theoretically could. In the future, it should use `self.gc_info`
+                // to get the ground-truth data.
+                let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
+                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
+                // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
+                // the real cutoff.
+                let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
+                if gc_cutoff > real_gc_cutoff {
+                    warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
+                    gc_cutoff = real_gc_cutoff;
+                }
+                gc_cutoff
+            };
            for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
                if lsn < &gc_cutoff {
                    retain_lsns_below_horizon.push(*lsn);
@@ -1846,7 +1998,7 @@ impl Timeline {
                .map(|desc| desc.get_lsn_range().end)
                .max()
            else {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
                return Ok(());
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -1869,7 +2021,7 @@ impl Timeline {
                }
            }
            if selected_layers.is_empty() {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
@@ -1936,14 +2088,15 @@ impl Timeline {

        // Step 1: construct a k-merge iterator over all layers.
        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
-        let layer_names = job_desc
-            .selected_layers
-            .iter()
-            .map(|layer| layer.layer_desc().layer_name())
-            .collect_vec();
-        if let Some(err) = check_valid_layermap(&layer_names) {
-            warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
-        }
+        // The check is disabled for now because it needs to be adjusted for partial compactions; it will be re-enabled later.
+        // let layer_names = job_desc
+        //     .selected_layers
+        //     .iter()
+        //     .map(|layer| layer.layer_desc().layer_name())
+        //     .collect_vec();
+        // if let Some(err) = check_valid_layermap(&layer_names) {
+        //     warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
+        // }
        // The maximum LSN we are processing in this compaction loop
        let end_lsn = job_desc
            .selected_layers
@@ -2048,6 +2201,11 @@ impl Timeline {
                // This is not handled in the filter iterator because shard is determined by hash.
                // Therefore, it does not give us any performance benefit to do things like skip
                // a whole layer file as handling key spaces (ranges).
+                if cfg!(debug_assertions) {
+                    let shard = self.shard_identity.shard_index();
+                    let owner = self.shard_identity.get_shard_number(&key);
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
                continue;
            }
            if !job_desc.compaction_key_range.contains(&key) {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -182,7 +182,7 @@ impl OpenLayerManager {
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<InMemoryLayer>> {
        ensure!(lsn.is_aligned());
@@ -212,15 +212,9 @@ impl OpenLayerManager {
                lsn
            );

-            let new_layer = InMemoryLayer::create(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_lsn,
-                gate_guard,
-                ctx,
-            )
-            .await?;
+            let new_layer =
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx)
+                    .await?;
            let layer = Arc::new(new_layer);

            self.layer_map.open_layer = Some(layer.clone());
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
                // advances it to its end LSN. 0 is just an initialization placeholder.
                let mut modification = timeline.begin_modification(Lsn(0));

+                if !records.is_empty() {
+                    timeline
+                        .metrics
+                        .wal_records_received
+                        .inc_by(records.len() as u64);
+                }
+
                for interpreted in records {
                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                        && uncommitted_records > 0
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
                        }

                        // Ingest the records without immediately committing them.
+                        timeline.metrics.wal_records_received.inc();
                        let ingested = walingest
                            .ingest_record(interpreted, &mut modification, &ctx)
                            .await
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
 use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
-use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
+use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
@@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io {
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
-    pub(crate) mod util {
-        pub(crate) mod size_tracking_writer;
-    }
 }

 #[derive(Debug)]
@@ -221,7 +218,7 @@ impl VirtualFile {
        self.inner.read_exact_at_page(page, offset, ctx).await
    }

-    pub async fn write_all_at<Buf: IoBuf + Send>(
+    pub async fn write_all_at<Buf: IoBufAligned + Send>(
        &self,
        buf: FullSlice<Buf>,
        offset: u64,
@@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner {
 }

 impl OwnedAsyncWriter for VirtualFile {
-    #[inline(always)]
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
+    async fn write_all_at<Buf: IoBufAligned + Send>(
+        &self,
        buf: FullSlice<Buf>,
+        offset: u64,
        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
-        res.map(move |v| (v, buf))
+    ) -> std::io::Result<FullSlice<Buf>> {
+        let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await;
+        res.map(|_| buf)
    }
 }

@@ -1451,7 +1448,7 @@ mod tests {
                }
            }
        }
-        async fn write_all_at<Buf: IoBuf + Send>(
+        async fn write_all_at<Buf: IoBufAligned + Send>(
            &self,
            buf: FullSlice<Buf>,
            offset: u64,
@@ -1594,6 +1591,7 @@ mod tests {
            &ctx,
        )
        .await?;
+
        file_a
            .write_all(b"foobar".to_vec().slice_len(), &ctx)
            .await?;
@@ -1652,10 +1650,10 @@ mod tests {
        )
        .await?;
        file_b
-            .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
+            .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
            .await?;
        file_b
-            .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
+            .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
            .await?;

        assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
@@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static {
 }

 /// Alignment at compile time.
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub struct ConstAlign<const A: usize>;

 impl<const A: usize> Alignment for ConstAlign<A> {
@@ -14,7 +14,7 @@ impl<const A: usize> Alignment for ConstAlign<A> {
 }

 /// Alignment at run time.
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub struct RuntimeAlign {
    align: usize,
 }
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
@@ -3,9 +3,10 @@ use std::{
    sync::Arc,
 };

-use super::{alignment::Alignment, raw::RawAlignedBuffer};
+use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign};

 /// An shared, immutable aligned buffer type.
+#[derive(Clone, Debug)]
 pub struct AlignedBuffer<A: Alignment> {
    /// Shared raw buffer.
    raw: Arc<RawAlignedBuffer<A>>,
@@ -86,6 +87,13 @@ impl<A: Alignment> AlignedBuffer<A> {
            range: begin..end,
        }
    }
+
+    /// Returns the mutable aligned buffer, if the immutable aligned buffer
+    /// has exactly one strong reference. Otherwise returns `None`.
+    pub fn into_mut(self) -> Option<AlignedBufferMut<A>> {
+        let raw = Arc::into_inner(self.raw)?;
+        Some(AlignedBufferMut::from_raw(raw))
+    }
 }

 impl<A: Alignment> Deref for AlignedBuffer<A> {
@@ -108,6 +116,14 @@ impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
    }
 }

+impl<const A: usize, const N: usize> From<&[u8; N]> for AlignedBuffer<ConstAlign<A>> {
+    fn from(value: &[u8; N]) -> Self {
+        let mut buf = AlignedBufferMut::with_capacity(N);
+        buf.extend_from_slice(value);
+        buf.freeze()
+    }
+}
+
 /// SAFETY: the underlying buffer references a stable memory region.
 unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
    fn stable_ptr(&self) -> *const u8 {
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -1,4 +1,7 @@
-use std::ops::{Deref, DerefMut};
+use std::{
+    mem::MaybeUninit,
+    ops::{Deref, DerefMut},
+};

 use super::{
    alignment::{Alignment, ConstAlign},
@@ -46,6 +49,11 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
 }

 impl<A: Alignment> AlignedBufferMut<A> {
+    /// Constructs a mutable aligned buffer from raw.
+    pub(super) fn from_raw(raw: RawAlignedBuffer<A>) -> Self {
+        AlignedBufferMut { raw }
+    }
+
    /// Returns the total number of bytes the buffer can hold.
    #[inline]
    pub fn capacity(&self) -> usize {
@@ -128,6 +136,39 @@ impl<A: Alignment> AlignedBufferMut<A> {
        let len = self.len();
        AlignedBuffer::from_raw(self.raw, 0..len)
    }
+
+    /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed.
+    #[inline]
+    pub fn extend_from_slice(&mut self, extend: &[u8]) {
+        let cnt = extend.len();
+        self.reserve(cnt);
+
+        // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy.
+        unsafe {
+            let dst = self.spare_capacity_mut();
+            // Reserved above
+            debug_assert!(dst.len() >= cnt);
+
+            core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt);
+        }
+        // SAFETY: We do have at least `cnt` bytes remaining before advance.
+        unsafe {
+            bytes::BufMut::advance_mut(self, cnt);
+        }
+    }
+
+    /// Returns the remaining spare capacity of the buffer (the bytes between `len()` and `capacity()`) as a slice of `MaybeUninit<u8>`.
+    #[inline]
+    fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit<u8>] {
+        // SAFETY: we guarantee that the `Self::capacity()` bytes starting at
+        // `Self::as_mut_ptr()` are allocated.
+        unsafe {
+            let ptr = self.as_mut_ptr().add(self.len());
+            let len = self.capacity() - self.len();
+
+            core::slice::from_raw_parts_mut(ptr.cast(), len)
+        }
+    }
 }

 impl<A: Alignment> Deref for AlignedBufferMut<A> {
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -1,9 +1,15 @@
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::{IoBuf, IoBufMut};

-use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};
+use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf};

+/// A marker trait for a mutable aligned buffer type.
 pub trait IoBufAlignedMut: IoBufMut {}

+/// A marker trait for an aligned buffer type.
+pub trait IoBufAligned: IoBuf {}
+
 impl IoBufAlignedMut for IoBufferMut {}

+impl IoBufAligned for IoBuffer {}
+
 impl IoBufAlignedMut for PageWriteGuardBuf {}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};

+use super::write::CheapCloneForRead;
+
 /// The true owned equivalent for Rust [`slice`]. Use this for the write path.
 ///
 /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
@@ -43,6 +45,18 @@ where
    }
 }

+impl<B> CheapCloneForRead for FullSlice<B>
+where
+    B: IoBuf + CheapCloneForRead,
+{
+    fn cheap_clone(&self) -> Self {
+        let bounds = self.slice.bounds();
+        let clone = self.slice.get_ref().cheap_clone();
+        let slice = clone.slice(bounds);
+        Self { slice }
+    }
+}
+
 pub(crate) trait IoBufExt {
    /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
    fn slice_len(self) -> FullSlice<Self>
--- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
@@ -1,50 +0,0 @@
-use crate::{
-    context::RequestContext,
-    virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
-};
-use tokio_epoll_uring::IoBuf;
-
-pub struct Writer<W> {
-    dst: W,
-    bytes_amount: u64,
-}
-
-impl<W> Writer<W> {
-    pub fn new(dst: W) -> Self {
-        Self {
-            dst,
-            bytes_amount: 0,
-        }
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        self.bytes_amount
-    }
-
-    pub fn as_inner(&self) -> &W {
-        &self.dst
-    }
-
-    /// Returns the wrapped `VirtualFile` object as well as the number
-    /// of bytes that were written to it through this object.
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub fn into_inner(self) -> (u64, W) {
-        (self.bytes_amount, self.dst)
-    }
-}
-
-impl<W> OwnedAsyncWriter for Writer<W>
-where
-    W: OwnedAsyncWriter,
-{
-    #[inline(always)]
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
-        self.bytes_amount += u64::try_from(nwritten).unwrap();
-        Ok((nwritten, buf))
-    }
-}
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -1,55 +1,88 @@
-use bytes::BytesMut;
+mod flush;
+use std::sync::Arc;
+
+use flush::FlushHandle;
 use tokio_epoll_uring::IoBuf;

-use crate::context::RequestContext;
+use crate::{
+    context::RequestContext,
+    virtual_file::{IoBuffer, IoBufferMut},
+};

-use super::io_buf_ext::{FullSlice, IoBufExt};
+use super::{
+    io_buf_aligned::IoBufAligned,
+    io_buf_ext::{FullSlice, IoBufExt},
+};
+
+pub(crate) use flush::FlushControl;
+
+pub(crate) trait CheapCloneForRead {
+    /// Returns a cheap clone of the buffer.
+    fn cheap_clone(&self) -> Self;
+}
+
+impl CheapCloneForRead for IoBuffer {
+    fn cheap_clone(&self) -> Self {
+        // Cheap clone over an `Arc`.
+        self.clone()
+    }
+}

 /// A trait for doing owned-buffer write IO.
 /// Think [`tokio::io::AsyncWrite`] but with owned buffers.
+/// The owned buffers need to be aligned due to Direct IO requirements.
 pub trait OwnedAsyncWriter {
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
+    fn write_all_at<Buf: IoBufAligned + Send>(
+        &self,
        buf: FullSlice<Buf>,
+        offset: u64,
        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)>;
+    ) -> impl std::future::Future<Output = std::io::Result<FullSlice<Buf>>> + Send;
 }

 /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
 /// small writes into larger writes of size [`Buffer::cap`].
-///
-/// # Passthrough Of Large Writers
-///
-/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
-/// cause the internal buffer to be flushed prematurely so that the large
-/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
-///
-/// This pass-through is generally beneficial for throughput, but if
-/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
-/// unlimited large writes may cause latency or fairness issues.
-///
-/// In such cases, a different implementation that always buffers in memory
-/// may be preferable.
-pub struct BufferedWriter<B, W> {
-    writer: W,
+// TODO(yuchen): For large writes, implementing a buffer bypass for the aligned parts of the write could be beneficial to throughput,
+// since we would avoid copying the majority of the data into the internal buffer.
+pub struct BufferedWriter<B: Buffer, W> {
+    writer: Arc<W>,
    /// invariant: always remains Some(buf) except
    /// - while IO is ongoing => goes back to Some() once the IO completed successfully
    /// - after an IO error => stays `None` forever
    ///
    /// In these exceptional cases, it's `None`.
-    buf: Option<B>,
+    mutable: Option<B>,
+    /// A handle to the background flush task for writing data to disk.
+    flush_handle: FlushHandle<B::IoBuf, W>,
+    /// The number of bytes submitted to the background task.
+    bytes_submitted: u64,
 }

 impl<B, Buf, W> BufferedWriter<B, W>
 where
-    B: Buffer<IoBuf = Buf> + Send,
-    Buf: IoBuf + Send,
-    W: OwnedAsyncWriter,
+    B: Buffer<IoBuf = Buf> + Send + 'static,
+    Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
+    W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
 {
-    pub fn new(writer: W, buf: B) -> Self {
+    /// Creates a new buffered writer.
+    ///
+    /// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
+    pub fn new(
+        writer: Arc<W>,
+        buf_new: impl Fn() -> B,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: &RequestContext,
+    ) -> Self {
        Self {
-            writer,
-            buf: Some(buf),
+            writer: writer.clone(),
+            mutable: Some(buf_new()),
+            flush_handle: FlushHandle::spawn_new(
+                writer,
+                buf_new(),
+                gate_guard,
+                ctx.attached_child(),
+            ),
+            bytes_submitted: 0,
        }
    }

@@ -57,87 +90,71 @@ where
        &self.writer
    }

+    /// Returns the number of bytes submitted to the background flush task.
+    pub fn bytes_submitted(&self) -> u64 {
+        self.bytes_submitted
+    }
+
    /// Panics if used after any of the write paths returned an error
-    pub fn inspect_buffer(&self) -> &B {
-        self.buf()
+    pub fn inspect_mutable(&self) -> &B {
+        self.mutable()
+    }
+
+    /// Gets a reference to the maybe flushed read-only buffer.
+    /// Returns `None` if the writer has not submitted any flush request.
+    pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice<Buf>> {
+        self.flush_handle.maybe_flushed.as_ref()
    }

    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
+    pub async fn flush_and_into_inner(
+        mut self,
+        ctx: &RequestContext,
+    ) -> std::io::Result<(u64, Arc<W>)> {
        self.flush(ctx).await?;

-        let Self { buf, writer } = self;
+        let Self {
+            mutable: buf,
+            writer,
+            mut flush_handle,
+            bytes_submitted: bytes_amount,
+        } = self;
+        flush_handle.shutdown().await?;
        assert!(buf.is_some());
-        Ok(writer)
+        Ok((bytes_amount, writer))
    }

+    /// Gets a reference to the mutable in-memory buffer.
    #[inline(always)]
-    fn buf(&self) -> &B {
-        self.buf
+    fn mutable(&self) -> &B {
+        self.mutable
            .as_ref()
            .expect("must not use after we returned an error")
    }

-    /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn write_buffered<S: IoBuf + Send>(
+    pub async fn write_buffered_borrowed(
        &mut self,
-        chunk: FullSlice<S>,
+        chunk: &[u8],
        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<S>)> {
-        let chunk = chunk.into_raw_slice();
-
-        let chunk_len = chunk.len();
-        // avoid memcpy for the middle of the chunk
-        if chunk.len() >= self.buf().cap() {
-            self.flush(ctx).await?;
-            // do a big write, bypassing `buf`
-            assert_eq!(
-                self.buf
-                    .as_ref()
-                    .expect("must not use after an error")
-                    .pending(),
-                0
-            );
-            let (nwritten, chunk) = self
-                .writer
-                .write_all(FullSlice::must_new(chunk), ctx)
-                .await?;
-            assert_eq!(nwritten, chunk_len);
-            return Ok((nwritten, chunk));
+    ) -> std::io::Result<usize> {
+        let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?;
+        if let Some(control) = control {
+            control.release().await;
        }
-        // in-memory copy the < BUFFER_SIZED tail of the chunk
-        assert!(chunk.len() < self.buf().cap());
-        let mut slice = &chunk[..];
-        while !slice.is_empty() {
-            let buf = self.buf.as_mut().expect("must not use after an error");
-            let need = buf.cap() - buf.pending();
-            let have = slice.len();
-            let n = std::cmp::min(need, have);
-            buf.extend_from_slice(&slice[..n]);
-            slice = &slice[n..];
-            if buf.pending() >= buf.cap() {
-                assert_eq!(buf.pending(), buf.cap());
-                self.flush(ctx).await?;
-            }
-        }
-        assert!(slice.is_empty(), "by now we should have drained the chunk");
-        Ok((chunk_len, FullSlice::must_new(chunk)))
+        Ok(len)
    }

-    /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
-    ///
-    /// It is less performant because we always have to copy the borrowed data into the internal buffer
-    /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
-    /// for large writes.
-    pub async fn write_buffered_borrowed(
+    /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior.
+    pub(crate) async fn write_buffered_borrowed_controlled(
        &mut self,
        mut chunk: &[u8],
        ctx: &RequestContext,
-    ) -> std::io::Result<usize> {
+    ) -> std::io::Result<(usize, Option<FlushControl>)> {
        let chunk_len = chunk.len();
+        let mut control: Option<FlushControl> = None;
        while !chunk.is_empty() {
-            let buf = self.buf.as_mut().expect("must not use after an error");
+            let buf = self.mutable.as_mut().expect("must not use after an error");
            let need = buf.cap() - buf.pending();
            let have = chunk.len();
            let n = std::cmp::min(need, have);
@@ -145,26 +162,27 @@ where
            chunk = &chunk[n..];
            if buf.pending() >= buf.cap() {
                assert_eq!(buf.pending(), buf.cap());
-                self.flush(ctx).await?;
+                if let Some(control) = control.take() {
+                    control.release().await;
+                }
+                control = self.flush(ctx).await?;
            }
        }
-        Ok(chunk_len)
+        Ok((chunk_len, control))
    }

-    async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
-        let buf = self.buf.take().expect("must not use after an error");
+    #[must_use = "caller must explicitly check the flush control"]
+    async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<Option<FlushControl>> {
+        let buf = self.mutable.take().expect("must not use after an error");
        let buf_len = buf.pending();
        if buf_len == 0 {
-            self.buf = Some(buf);
-            return Ok(());
+            self.mutable = Some(buf);
+            return Ok(None);
        }
-        let slice = buf.flush();
-        let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
-        assert_eq!(nwritten, buf_len);
-        self.buf = Some(Buffer::reuse_after_flush(
-            slice.into_raw_slice().into_inner(),
-        ));
-        Ok(())
+        let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?;
+        self.bytes_submitted += u64::try_from(buf_len).unwrap();
+        self.mutable = Some(recycled);
+        Ok(Some(flush_control))
    }
 }

@@ -192,64 +210,77 @@ pub trait Buffer {
    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
 }

-impl Buffer for BytesMut {
-    type IoBuf = BytesMut;
+impl Buffer for IoBufferMut {
+    type IoBuf = IoBuffer;

-    #[inline(always)]
    fn cap(&self) -> usize {
        self.capacity()
    }

    fn extend_from_slice(&mut self, other: &[u8]) {
-        BytesMut::extend_from_slice(self, other)
+        if self.len() + other.len() > self.cap() {
+            panic!("Buffer capacity exceeded");
+        }
+
+        IoBufferMut::extend_from_slice(self, other);
    }

-    #[inline(always)]
    fn pending(&self) -> usize {
        self.len()
    }

-    fn flush(self) -> FullSlice<BytesMut> {
-        self.slice_len()
+    fn flush(self) -> FullSlice<Self::IoBuf> {
+        self.freeze().slice_len()
    }

-    fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
-        iobuf.clear();
-        iobuf
-    }
-}
-
-impl OwnedAsyncWriter for Vec<u8> {
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        _: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        self.extend_from_slice(&buf[..]);
-        Ok((buf.len(), buf))
+    /// Caller should make sure that `iobuf` has only one strong reference before invoking this method.
+    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
+        let mut recycled = iobuf
+            .into_mut()
+            .expect("buffer should only have one strong reference");
+        recycled.clear();
+        recycled
    }
 }

 #[cfg(test)]
 mod tests {
-    use bytes::BytesMut;
+    use std::sync::Mutex;

    use super::*;
    use crate::context::{DownloadBehavior, RequestContext};
    use crate::task_mgr::TaskKind;

-    #[derive(Default)]
+    #[derive(Default, Debug)]
    struct RecorderWriter {
-        writes: Vec<Vec<u8>>,
+        /// record bytes and write offsets.
+        writes: Mutex<Vec<(Vec<u8>, u64)>>,
    }
+
+    impl RecorderWriter {
+        /// Gets a copy of the bytes of each recorded write, in recorded order; the write offsets are dropped.
+        fn get_writes(&self) -> Vec<Vec<u8>> {
+            self.writes
+                .lock()
+                .unwrap()
+                .iter()
+                .map(|(buf, _)| buf.clone())
+                .collect()
+        }
+    }
+
    impl OwnedAsyncWriter for RecorderWriter {
-        async fn write_all<Buf: IoBuf + Send>(
-            &mut self,
+        async fn write_all_at<Buf: IoBufAligned + Send>(
+            &self,
            buf: FullSlice<Buf>,
+            offset: u64,
            _: &RequestContext,
-        ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-            self.writes.push(Vec::from(&buf[..]));
-            Ok((buf.len(), buf))
+        ) -> std::io::Result<FullSlice<Buf>> {
+            self.writes
+                .lock()
+                .unwrap()
+                .push((Vec::from(&buf[..]), offset));
+            Ok(buf)
        }
    }

@@ -257,71 +288,21 @@ mod tests {
        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
    }

-    macro_rules! write {
-        ($writer:ident, $data:literal) => {{
-            $writer
-                .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
-                .await?;
-        }};
-    }
-
    #[tokio::test]
-    async fn test_buffered_writes_only() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"a");
-        write!(writer, b"b");
-        write!(writer, b"c");
-        write!(writer, b"d");
-        write!(writer, b"e");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_passthrough_writes_only() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"abc");
-        write!(writer, b"de");
-        write!(writer, b"");
-        write!(writer, b"fghijk");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"a");
-        write!(writer, b"bc");
-        write!(writer, b"d");
-        write!(writer, b"e");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
+    async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> {
        let ctx = test_ctx();
        let ctx = &ctx;
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
+        let recorder = Arc::new(RecorderWriter::default());
+        let gate = utils::sync::gate::Gate::default();
+        let mut writer = BufferedWriter::<_, RecorderWriter>::new(
+            recorder,
+            || IoBufferMut::with_capacity(2),
+            gate.enter()?,
+            ctx,
+        );

        writer.write_buffered_borrowed(b"abc", ctx).await?;
+        writer.write_buffered_borrowed(b"", ctx).await?;
        writer.write_buffered_borrowed(b"d", ctx).await?;
        writer.write_buffered_borrowed(b"e", ctx).await?;
        writer.write_buffered_borrowed(b"fg", ctx).await?;
@@ -329,9 +310,9 @@ mod tests {
        writer.write_buffered_borrowed(b"j", ctx).await?;
        writer.write_buffered_borrowed(b"klmno", ctx).await?;

-        let recorder = writer.flush_and_into_inner(ctx).await?;
+        let (_, recorder) = writer.flush_and_into_inner(ctx).await?;
        assert_eq!(
-            recorder.writes,
+            recorder.get_writes(),
            {
                let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
                expect
--- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -0,0 +1,314 @@
+use std::sync::Arc;
+
+use utils::sync::duplex;
+
+use crate::{
+    context::RequestContext,
+    virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice},
+};
+
+use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter};
+
+/// A handle to the flush task.
+pub struct FlushHandle<Buf, W> {
+    inner: Option<FlushHandleInner<Buf, W>>,
+    /// Immutable buffer for serving tail reads.
+    /// `None` if no flush request has been submitted.
+    pub(super) maybe_flushed: Option<FullSlice<Buf>>,
+}
+
+pub struct FlushHandleInner<Buf, W> {
+    /// A bi-directional channel that sends (buffer, offset) for writes,
+    /// and receives recycled buffers.
+    channel: duplex::mpsc::Duplex<FlushRequest<Buf>, FullSlice<Buf>>,
+    /// Join handle for the background flush task.
+    join_handle: tokio::task::JoinHandle<std::io::Result<Arc<W>>>,
+}
+
+struct FlushRequest<Buf> {
+    slice: FullSlice<Buf>,
+    offset: u64,
+    #[cfg(test)]
+    ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>,
+    #[cfg(test)]
+    done_flush_tx: tokio::sync::oneshot::Sender<()>,
+}
+
+/// Constructs a request and a control object for a new flush operation.
+#[cfg(not(test))]
+fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
+    let request = FlushRequest { slice, offset };
+    let control = FlushControl::untracked();
+
+    (request, control)
+}
+
+/// Constructs a request and a control object for a new flush operation.
+#[cfg(test)]
+fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
+    let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel();
+    let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel();
+    let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx);
+
+    let request = FlushRequest {
+        slice,
+        offset,
+        ready_to_flush_rx,
+        done_flush_tx,
+    };
+    (request, control)
+}
+
+/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior.
+#[cfg(test)]
+pub(crate) struct FlushControl {
+    not_started: FlushNotStarted,
+}
+
+#[cfg(not(test))]
+pub(crate) struct FlushControl;
+
+impl FlushControl {
+    #[cfg(test)]
+    fn not_started(
+        ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
+        done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+    ) -> Self {
+        FlushControl {
+            not_started: FlushNotStarted {
+                ready_to_flush_tx,
+                done_flush_rx,
+            },
+        }
+    }
+
+    #[cfg(not(test))]
+    fn untracked() -> Self {
+        FlushControl
+    }
+
+    /// In tests, turn flush control into a not started state.
+    #[cfg(test)]
+    pub(crate) fn into_not_started(self) -> FlushNotStarted {
+        self.not_started
+    }
+
+    /// Release control to the submitted buffer.
+    ///
+    /// In a `cfg(test)` environment, the buffer is guaranteed to be flushed to disk once [`FlushControl::release`] finishes execution.
+    pub async fn release(self) {
+        #[cfg(test)]
+        {
+            self.not_started
+                .ready_to_flush()
+                .wait_until_flush_is_done()
+                .await;
+        }
+    }
+}
+
+impl<Buf, W> FlushHandle<Buf, W>
+where
+    Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
+    W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
+{
+    /// Spawns a new background flush task and obtains a handle.
+    ///
+    /// Note: Buffers are handed back and forth with the background task over the duplex channel, so we do not need to explicitly maintain a queue of buffers.
+    pub fn spawn_new<B>(
+        file: Arc<W>,
+        buf: B,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: RequestContext,
+    ) -> Self
+    where
+        B: Buffer<IoBuf = Buf> + Send + 'static,
+    {
+        // It is fine to buffer up to only 1 message. We only have 1 message in flight at a time.
+        let (front, back) = duplex::mpsc::channel(1);
+
+        let join_handle = tokio::spawn(async move {
+            FlushBackgroundTask::new(back, file, gate_guard, ctx)
+                .run(buf.flush())
+                .await
+        });
+
+        FlushHandle {
+            inner: Some(FlushHandleInner {
+                channel: front,
+                join_handle,
+            }),
+            maybe_flushed: None,
+        }
+    }
+
+    /// Submits a buffer to be flushed in the background task.
+    /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged.
+    /// The submitted buffer is always saved in `Self::maybe_flushed` so it can keep serving reads
+    /// while the flush is in progress; this replaces any previously saved buffer.
+    pub async fn flush<B>(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)>
+    where
+        B: Buffer<IoBuf = Buf> + Send + 'static,
+    {
+        let slice = buf.flush();
+
+        // Saves a buffer for read while flushing. This also removes reference to the old buffer.
+        self.maybe_flushed = Some(slice.cheap_clone());
+
+        let (request, flush_control) = new_flush_op(slice, offset);
+
+        // Submits the buffer to the background task.
+        let submit = self.inner_mut().channel.send(request).await;
+        if submit.is_err() {
+            return self.handle_error().await;
+        }
+
+        // Wait for an available buffer from the background flush task.
+        // This is the BACKPRESSURE mechanism: if the flush task can't keep up,
+        // then the write path will eventually wait for it here.
+        let Some(recycled) = self.inner_mut().channel.recv().await else {
+            return self.handle_error().await;
+        };
+
+        // The only other place that could hold a reference to the recycled buffer
+        // is in `Self::maybe_flushed`, but we have already replaced it with the new buffer.
+        let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner());
+        Ok((recycled, flush_control))
+    }
+
+    async fn handle_error<T>(&mut self) -> std::io::Result<T> {
+        Err(self
+            .shutdown()
+            .await
+            .expect_err("flush task only disconnects duplex if it exits with an error"))
+    }
+
+    /// Cleans up the channel and joins the flush task.
+    pub async fn shutdown(&mut self) -> std::io::Result<Arc<W>> {
+        let handle = self
+            .inner
+            .take()
+            .expect("must not use after we returned an error");
+        drop(handle.channel.tx);
+        handle.join_handle.await.unwrap()
+    }
+
+    /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`.
+    /// This only happens if the handle is used after an error.
+    fn inner_mut(&mut self) -> &mut FlushHandleInner<Buf, W> {
+        self.inner
+            .as_mut()
+            .expect("must not use after we returned an error")
+    }
+}
+
+/// A background task for flushing data to disk.
+pub struct FlushBackgroundTask<Buf, W> {
+    /// A bi-directional channel that receives (buffer, offset) for writes,
+    /// and sends back recycled buffers.
+    channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
+    /// A writer for persisting data to disk.
+    writer: Arc<W>,
+    ctx: RequestContext,
+    /// Prevents the timeline from shutting down until the flush background task finishes flushing all remaining buffers to disk.
+    _gate_guard: utils::sync::gate::GateGuard,
+}
+
+impl<Buf, W> FlushBackgroundTask<Buf, W>
+where
+    Buf: IoBufAligned + Send + Sync,
+    W: OwnedAsyncWriter + Sync + 'static,
+{
+    /// Creates a new background flush task.
+    fn new(
+        channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
+        file: Arc<W>,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: RequestContext,
+    ) -> Self {
+        FlushBackgroundTask {
+            channel,
+            writer: file,
+            _gate_guard: gate_guard,
+            ctx,
+        }
+    }
+
+    /// Runs the background flush task.
+    /// The passed in slice is immediately sent back to the flush handle through the duplex channel.
+    async fn run(mut self, slice: FullSlice<Buf>) -> std::io::Result<Arc<W>> {
+        // Sends the extra buffer back to the handle.
+        self.channel.send(slice).await.map_err(|_| {
+            std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early")
+        })?;
+
+        // Exit condition: the channel is closed and there are no remaining buffers to be flushed.
+        while let Some(request) = self.channel.recv().await {
+            #[cfg(test)]
+            {
+                // In test, wait for control to signal that we are ready to flush.
+                if request.ready_to_flush_rx.await.is_err() {
+                    tracing::debug!("control dropped");
+                }
+            }
+
+            // Write slice to disk at `offset`.
+            let slice = self
+                .writer
+                .write_all_at(request.slice, request.offset, &self.ctx)
+                .await?;
+
+            #[cfg(test)]
+            {
+                // In test, tell control we are done flushing buffer.
+                if request.done_flush_tx.send(()).is_err() {
+                    tracing::debug!("control dropped");
+                }
+            }
+
+            // Sends the buffer back to the handle for reuse. The handle is in charge of cleaning the buffer.
+            if self.channel.send(slice).await.is_err() {
+                // Although the channel is closed, we still need to finish flushing the remaining buffers.
+                continue;
+            }
+        }
+
+        Ok(self.writer)
+    }
+}
+
+#[cfg(test)]
+pub(crate) struct FlushNotStarted {
+    ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
+    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+}
+
+#[cfg(test)]
+pub(crate) struct FlushInProgress {
+    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+}
+
+#[cfg(test)]
+pub(crate) struct FlushDone;
+
+#[cfg(test)]
+impl FlushNotStarted {
+    /// Signals the background task the buffer is ready to flush to disk.
+    pub fn ready_to_flush(self) -> FlushInProgress {
+        self.ready_to_flush_tx
+            .send(())
+            .map(|_| FlushInProgress {
+                done_flush_rx: self.done_flush_rx,
+            })
+            .unwrap()
+    }
+}
+
+#[cfg(test)]
+impl FlushInProgress {
+    /// Waits until background flush is done.
+    pub async fn wait_until_flush_is_done(self) -> FlushDone {
+        self.done_flush_rx.await.unwrap();
+        FlushDone
+    }
+}
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -582,18 +582,21 @@ impl WalIngest {
                forknum: FSM_FORKNUM,
            };

+            // Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
+            // and instead of digging in the FSM bitmap format we just clear the whole page.
            let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
            let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
-            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
-                // Tail of last remaining FSM page has to be zeroed.
-                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
+            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
+                && self
+                    .shard
+                    .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
+            {
                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                fsm_physical_page_no += 1;
            }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the FSM relation size, if it even has one.
            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > fsm_physical_page_no {
-                // check if something to do: FSM is larger than truncate position
                self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
                    .await?;
            }
@@ -617,7 +620,7 @@ impl WalIngest {
            // tail bits in the last remaining map page, representing truncated heap
            // blocks, need to be cleared. This is not only tidy, but also necessary
            // because we don't get a chance to clear the bits if the heap is extended
-            // again.
+            // again. Only do this on the shard that owns the page.
            if (trunc_byte != 0 || trunc_offs != 0)
                && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
            {
@@ -631,10 +634,9 @@ impl WalIngest {
                )?;
                vm_page_no += 1;
            }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the VM relation size, if it even has one.
            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > vm_page_no {
-                // check if something to do: VM is larger than truncate position
                self.put_rel_truncation(modification, rel, vm_page_no, ctx)
                    .await?;
            }
@@ -875,22 +877,24 @@ impl WalIngest {
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
-            .await?
-        {
-            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            for segno in modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+                .await?
+            {
+                let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;

-            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
-                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
-            });
+                let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                    pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
+                });

-            if may_delete {
-                modification
-                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
-                    .await?;
-                trace!("Drop CLOG segment {:>04X}", segno);
+                if may_delete {
+                    modification
+                        .drop_slru_segment(SlruKind::Clog, segno, ctx)
+                        .await?;
+                    trace!("Drop CLOG segment {:>04X}", segno);
+                }
            }
        }

@@ -1045,16 +1049,18 @@ impl WalIngest {

        // Delete all the segments except the last one. The last segment can still
        // contain, possibly partially, valid data.
-        while segment != endsegment {
-            modification
-                .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
-                .await?;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            while segment != endsegment {
+                modification
+                    .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
+                    .await?;

-            /* move to next segment, handling wraparound correctly */
-            if segment == maxsegment {
-                segment = 0;
-            } else {
-                segment += 1;
+                /* move to next segment, handling wraparound correctly */
+                if segment == maxsegment {
+                    segment = 0;
+                } else {
+                    segment += 1;
+                }
            }
        }

--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -22,6 +22,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "portability/instr_time.h"
 #include "postmaster/interrupt.h"
 #include "storage/buf_internals.h"
 #include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
 	 */
 	PSConnectionState state;
 	PGconn		   *conn;
+
+	/* request / response counters for debugging */
+	uint64			nrequests_sent;
+	uint64			nresponses_received;
+
 	/*---
 	 * WaitEventSet containing:
 	 *	- WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}

 		shard->state = PS_Connected;
+		shard->nrequests_sent = 0;
+		shard->nresponses_received = 0;
 	}
 	/* FALLTHROUGH */
 	case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
 	int			ret;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
+	instr_time	now,
+				start_ts,
+				since_start,
+				last_log_ts,
+				since_last_log;
+	bool		logged = false;
+
+	/*
+	 * As a debugging aid, if we don't get a response for a long time, print a
+	 * log message.
+	 *
+	 * 10 s is a very generous threshold, normally we expect a response in a
+	 * few milliseconds. We have metrics to track latencies in normal ranges,
+	 * but in the cases that take exceptionally long, it's useful to log the
+	 * exact timestamps.
+	 */
+#define LOG_INTERVAL_US		UINT64CONST(10 * 1000000)
+
+	INSTR_TIME_SET_CURRENT(now);
+	start_ts = last_log_ts = now;
+	INSTR_TIME_SET_ZERO(since_last_log);

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
 	if (ret == 0)
 	{
 		WaitEvent	event;
+		long		timeout;
+		/* WaitEventSetWait() takes milliseconds. Clamp at 0 (poll once) when a log message is overdue: a negative timeout would block forever, and the former Min(0, ...) always yielded 0, i.e. a busy loop. */
+		timeout = Max(0, (long) (((int64) LOG_INTERVAL_US - (int64) INSTR_TIME_GET_MICROSEC(since_last_log)) / 1000));

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+		(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
 								WAIT_EVENT_NEON_PS_READ);
 		ResetLatch(MyLatch);

@@ -684,9 +716,40 @@ retry:
 			}
 		}

+		/*
+		 * Print a message to the log if a long time has passed with no
+		 * response.
+		 */
+		INSTR_TIME_SET_CURRENT(now);
+		since_last_log = now;
+		INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
+		if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
+		{
+			since_start = now;
+			INSTR_TIME_SUBTRACT(since_start, start_ts);
+			neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
+						   INSTR_TIME_GET_DOUBLE(since_start),
+						   shard->nrequests_sent, shard->nresponses_received);
+			last_log_ts = now;
+			logged = true;
+		}
+
 		goto retry;
 	}

+	/*
+	 * If we logged earlier that the response is taking a long time, log
+	 * another message when the response is finally received.
+	 */
+	if (logged)
+	{
+		INSTR_TIME_SET_CURRENT(now);
+		since_start = now;
+		INSTR_TIME_SUBTRACT(since_start, start_ts);
+		neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
+					   INSTR_TIME_GET_DOUBLE(since_start));
+	}
+
 	return ret;
 }

@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
 	 * point, but on the grand scheme of things it's only a small issue.
 	 */
+	shard->nrequests_sent++;
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
 		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}

+	shard->nresponses_received++;
 	return (NeonResponse *) resp;
 }

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
 	 * ensuring we have received all but the last n requests (n = newsize).
 	 */
 	if (MyPState->n_requests_inflight > newsize)
-		prefetch_wait_for(MyPState->ring_unused - newsize);
+	{
+		Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
+		prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}

 	/* construct the new PrefetchState, and copy over the memory contexts */
 	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_last = newsize;
 	newPState->ring_unused = newsize;
 	newPState->ring_receive = newsize;
-	newPState->ring_flush = newsize;
 	newPState->max_shard_no = MyPState->max_shard_no;
 	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));

@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
 		}
 		newPState->n_unused -= 1;
 	}
+	newPState->ring_flush = newPState->ring_receive;

 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
 		if (slot->status == PRFS_RECEIVED)
 		{
 			pfree(slot->response);
@@ -610,6 +615,9 @@ prefetch_read(PrefetchRequest *slot)
 {
 	NeonResponse *response;
 	MemoryContext old;
+	BufferTag	buftag;
+	shardno_t	shard_no;
+	uint64		my_ring_index;

 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
@@ -623,11 +631,29 @@ prefetch_read(PrefetchRequest *slot)
 					   slot->status, slot->response,
 					   (long)slot->my_ring_index, (long)MyPState->ring_receive);

+	/*
+	 * Copy the request info so that if an error happens and the prefetch
+	 * queue is flushed during the receive call, we can print the original
+	 * values in the error message
+	 */
+	buftag = slot->buftag;
+	shard_no = slot->shard_no;
+	my_ring_index = slot->my_ring_index;
+
 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive(slot->shard_no);
+	response = (NeonResponse *) page_server->receive(shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
 		MyPState->n_requests_inflight -= 1;
@@ -642,11 +668,15 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
-		neon_shard_log(slot->shard_no, LOG,
+		/*
+		 * Note: The slot might no longer be valid, if the connection was lost
+		 * and the prefetch queue was flushed during the receive call
+		 */
+		neon_shard_log(shard_no, LOG,
 					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long)slot->my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
-					   slot->buftag.forkNum, slot->buftag.blockNum);
+					   (long) my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
+					   buftag.forkNum, buftag.blockNum);
 		return false;
 	}
 }
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ControlPlane(api, ()) => match &**api {
+                ControlPlaneClient::ProxyV1(endpoint) => fmt
+                    .debug_tuple("ControlPlane::ProxyV1")
+                    .field(&endpoint.url())
+                    .finish(),
                ControlPlaneClient::Neon(endpoint) => fmt
                    .debug_tuple("ControlPlane::Neon")
                    .field(&endpoint.url())
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -46,6 +46,9 @@ enum AuthBackendType {
    #[value(name("console"), alias("cplane"))]
    ControlPlane,

+    #[value(name("cplane-v1"), alias("control-plane"))]
+    ControlPlaneV1,
+
    #[value(name("link"), alias("control-redirect"))]
    ConsoleRedirect,

@@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> {
                        .instrument(span),
                );
            }
+        } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
+            match (redis_notifications_client, regional_redis_client.clone()) {
+                (None, None) => {}
+                (client1, client2) => {
+                    let cache = api.caches.project_info.clone();
+                    if let Some(client) = client1 {
+                        maintenance_tasks.spawn(notifications::task_main(
+                            client,
+                            cache.clone(),
+                            cancel_map.clone(),
+                            args.region.clone(),
+                        ));
+                    }
+                    if let Some(client) = client2 {
+                        maintenance_tasks.spawn(notifications::task_main(
+                            client,
+                            cache.clone(),
+                            cancel_map.clone(),
+                            args.region.clone(),
+                        ));
+                    }
+                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
+                }
+            }
+            if let Some(regional_redis_client) = regional_redis_client {
+                let cache = api.caches.endpoints_cache.clone();
+                let con = regional_redis_client;
+                let span = tracing::info_span!("endpoints_cache");
+                maintenance_tasks.spawn(
+                    async move { cache.do_read(con, cancellation_token.clone()).await }
+                        .instrument(span),
+                );
+            }
        }
    }

@@ -662,6 +698,65 @@ fn build_auth_backend(
    args: &ProxyCliArgs,
 ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
    match &args.auth_backend {
+        AuthBackendType::ControlPlaneV1 => {
+            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
+            let project_info_cache_config: ProjectInfoCacheOptions =
+                args.project_info_cache.parse()?;
+            let endpoint_cache_config: config::EndpointCacheConfig =
+                args.endpoint_cache_config.parse()?;
+
+            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
+            info!(
+                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
+            );
+            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
+                wake_compute_cache_config,
+                project_info_cache_config,
+                endpoint_cache_config,
+            )));
+
+            let config::ConcurrencyLockOptions {
+                shards,
+                limiter,
+                epoch,
+                timeout,
+            } = args.wake_compute_lock.parse()?;
+            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
+            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
+                "wake_compute_lock",
+                limiter,
+                shards,
+                timeout,
+                epoch,
+                &Metrics::get().wake_compute_lock,
+            )?));
+            tokio::spawn(locks.garbage_collect_worker());
+
+            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
+
+            let endpoint = http::Endpoint::new(url, http::new_client());
+
+            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
+            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
+            let wake_compute_endpoint_rate_limiter =
+                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+
+            let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
+                endpoint,
+                args.control_plane_token.clone(),
+                caches,
+                locks,
+                wake_compute_endpoint_rate_limiter,
+            );
+
+            let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
+            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
+            let config = Box::leak(Box::new(auth_backend));
+
+            Ok(Either::Left(config))
+        }
+
        AuthBackendType::ControlPlane => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
@@ -697,13 +792,15 @@ fn build_auth_backend(
            )?));
            tokio::spawn(locks.garbage_collect_worker());

-            let url = args.auth_endpoint.parse()?;
+            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
+
            let endpoint = http::Endpoint::new(url, http::new_client());

            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
            let wake_compute_endpoint_rate_limiter =
                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+
            let api = control_plane::client::neon::NeonControlPlaneClient::new(
                endpoint,
                args.control_plane_token.clone(),
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -70,11 +70,12 @@ impl ReportableError for CancelError {
 impl<P: CancellationPublisher> CancellationHandler<P> {
    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
    pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
-        // HACK: We'd rather get the real backend_pid but postgres_client doesn't
-        // expose it and we don't want to do another roundtrip to query
-        // for it. The client will be able to notice that this is not the
-        // actual backend_pid, but backend_pid is not used for anything
-        // so it doesn't matter.
+        // we intentionally generate a random "backend pid" and "secret key" here.
+        // we use the corresponding u64 as an identifier for the
+        // actual endpoint+pid+secret for postgres/pgbouncer.
+        //
+        // if we forwarded the backend_pid from postgres to the client, there would be a lot
+        // of overlap between our computes as most pids are small (~100).
        let key = loop {
            let key = rand::random();

@@ -114,7 +115,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
                IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
            };
            if !self.limiter.lock().unwrap().check(subnet_key, 1) {
-                tracing::debug!("Rate limit exceeded. Skipping cancellation message");
+                // log only the subnet part of the IP address to know which subnet is rate limited
+                tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
                Metrics::get()
                    .proxy
                    .cancellation_requests_total
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -131,49 +131,37 @@ impl ConnCfg {
    }

    /// Apply startup message params to the connection config.
-    pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
-        // Only set `user` if it's not present in the config.
-        // Console redirect auth flow takes username from the console's response.
-        if let (None, Some(user)) = (self.get_user(), params.get("user")) {
-            self.user(user);
+    pub(crate) fn set_startup_params(
+        &mut self,
+        params: &StartupMessageParams,
+        arbitrary_params: bool,
+    ) {
+        if !arbitrary_params {
+            self.set_param("client_encoding", "UTF8");
        }
-
-        // Only set `dbname` if it's not present in the config.
-        // Console redirect auth flow takes dbname from the console's response.
-        if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
-            self.dbname(dbname);
-        }
-
-        // Don't add `options` if they were only used for specifying a project.
-        // Connection pools don't support `options`, because they affect backend startup.
-        if let Some(options) = filtered_options(params) {
-            self.options(&options);
-        }
-
-        if let Some(app_name) = params.get("application_name") {
-            self.application_name(app_name);
-        }
-
-        // TODO: This is especially ugly...
-        if let Some(replication) = params.get("replication") {
-            use postgres_client::config::ReplicationMode;
-            match replication {
-                "true" | "on" | "yes" | "1" => {
-                    self.replication_mode(ReplicationMode::Physical);
+        for (k, v) in params.iter() {
+            match k {
+                // Only set `user` / `database` if they're not already present in the config.
+                // Console redirect auth flow takes both from the console's response.
+                "user" if self.user_is_set() => continue,
+                "database" if self.db_is_set() => continue,
+                "options" => {
+                    if let Some(options) = filtered_options(v) {
+                        self.set_param(k, &options);
+                    }
                }
-                "database" => {
-                    self.replication_mode(ReplicationMode::Logical);
+                "user" | "database" | "application_name" | "replication" => {
+                    self.set_param(k, v);
                }
-                _other => {}
+
+                // If arbitrary params are allowed, forward every other parameter as-is.
+                // `arbitrary_params` is a flag for a period of backwards compatibility.
+                k if arbitrary_params => {
+                    self.set_param(k, v);
+                }
+                _ => {}
            }
        }
-
-        // TODO: extend the list of the forwarded startup parameters.
-        // Currently, tokio-postgres doesn't allow us to pass
-        // arbitrary parameters, but the ones above are a good start.
-        //
-        // This and the reverse params problem can be better addressed
-        // in a bespoke connection machinery (a new library for that sake).
    }
 }

@@ -347,10 +335,9 @@ impl ConnCfg {
 }

 /// Retrieve `options` from a startup message, dropping all proxy-specific flags.
-fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+fn filtered_options(options: &str) -> Option<String> {
    #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
+    let options: String = StartupMessageParams::parse_options_raw(options)
        .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();
@@ -427,27 +414,24 @@ mod tests {
    #[test]
    fn test_filtered_options() {
        // Empty options is unlikely to be useful anyway.
-        let params = StartupMessageParams::new([("options", "")]);
-        assert_eq!(filtered_options(&params), None);
+        let params = "";
+        assert_eq!(filtered_options(params), None);

        // It's likely that clients will only use options to specify endpoint/project.
-        let params = StartupMessageParams::new([("options", "project=foo")]);
-        assert_eq!(filtered_options(&params), None);
+        let params = "project=foo";
+        assert_eq!(filtered_options(params), None);

        // Same, because unescaped whitespaces are no-op.
-        let params = StartupMessageParams::new([("options", " project=foo ")]);
-        assert_eq!(filtered_options(&params).as_deref(), None);
+        let params = " project=foo ";
+        assert_eq!(filtered_options(params).as_deref(), None);

-        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
+        let params = r"\  project=foo \ ";
+        assert_eq!(filtered_options(params).as_deref(), Some(r"\  \ "));

-        let params = StartupMessageParams::new([("options", "project = foo")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));

-        let params = StartupMessageParams::new([(
-            "options",
-            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
-        )]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo neon_endpoint_type:read_write   neon_lsn:0/2 neon_proxy_params_compat:true";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
    }
 }
--- a/Show More
+++ b/Show More