convert appendresponse

convert appendrequest
convert ProposerElected
2026-05-21 07:00:38 +00:00 · 2025-02-04 10:02:43 +01:00 · 2025-01-30 13:21:40 +01:00 · 2025-01-30 12:23:27 +01:00 · 2025-01-30 11:29:14 +01:00 · 2025-01-30 11:10:58 +01:00
192 changed files with 10729 additions and 4428 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -24,4 +24,3 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
-!debug-oom/
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -25,3 +25,4 @@ config-variables:
  - PGREGRESS_PG17_PROJECT_ID
  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
+  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -346,25 +346,22 @@ jobs:
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

-  report-benchmarks-failures:
+  report-benchmarks-results-to-slack:
    needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
-    permissions:
-      id-token: write # aws-actions/configure-aws-credentials
-      statuses: write
-      contents: write
-      pull-requests: write
+    if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
    runs-on: ubuntu-22.04

    steps:
-    - uses: slackapi/slack-github-action@v1
+    - uses: slackapi/slack-github-action@v2
      with:
-        channel-id: C060CNA47S9 # on-call-staging-storage-stream
-        slack-message: |
-          Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
-          <${{ needs.create-test-report.outputs.report-url }}|Allure report>
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+        method: chat.postMessage
+        token: ${{ secrets.SLACK_BOT_TOKEN }}
+        payload: |
+          channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
+          text: |
+            Benchmarks on main: *${{ needs.benchmarks.result }}*
+            - <${{ needs.create-test-report.outputs.report-url }}|Allure report>
+            - <${{ github.event.head_commit.url }}|${{ github.sha }}>

  create-test-report:
    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
@@ -728,30 +725,6 @@ jobs:
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

-      - name: Build compute-tools image
-        # compute-tools are Postgres independent, so build it only once
-        # We pick 16, because that builds on debian 11 with older glibc (and is
-        # thus compatible with newer glibc), rather than 17 on Debian 12, as
-        # that isn't guaranteed to be compatible with Debian 11
-        if: matrix.version.pg == 'v16'
-        uses: docker/build-push-action@v6
-        with:
-          target: compute-tools-image
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
-            DEBIAN_VERSION=${{ matrix.version.debian }}
-          provenance: false
-          push: true
-          pull: true
-          file: compute/compute-node.Dockerfile
-          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
-          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
-
  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
    permissions:
@@ -794,14 +767,6 @@ jobs:
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

-      - name: Create multi-arch compute-tools image
-        if: matrix.version.pg == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                          -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
-
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
@@ -817,12 +782,6 @@ jobs:
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

-      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version.pg == 'v16'
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
-
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, large ]
@@ -1001,9 +960,6 @@ jobs:
            docker buildx imagetools create -t $repo/neon:latest \
                                               $repo/neon:${{ needs.tag.outputs.build-tag }}

-            docker buildx imagetools create -t $repo/compute-tools:latest \
-                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
-
            for version in ${VERSIONS}; do
              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -1032,7 +988,7 @@ jobs:
      - name: Copy all images to prod ECR
        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
+          for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done
@@ -1044,7 +1000,7 @@ jobs:
    with:
      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1056,7 +1012,7 @@ jobs:
    with:
      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -718,13 +718,13 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.7.5"
+version = "0.7.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
 dependencies = [
 "async-trait",
 "axum-core",
- "base64 0.21.1",
+ "base64 0.22.1",
 "bytes",
 "futures-util",
 "http 1.1.0",
@@ -746,8 +746,8 @@ dependencies = [
 "sha1",
 "sync_wrapper 1.0.1",
 "tokio",
- "tokio-tungstenite",
- "tower",
+ "tokio-tungstenite 0.24.0",
+ "tower 0.5.2",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -1267,6 +1267,7 @@ dependencies = [
 "aws-config",
 "aws-sdk-kms",
 "aws-sdk-s3",
+ "axum",
 "base64 0.13.1",
 "bytes",
 "camino",
@@ -1277,7 +1278,7 @@ dependencies = [
 "fail",
 "flate2",
 "futures",
- "hyper 0.14.30",
+ "http 1.1.0",
 "metrics",
 "nix 0.27.1",
 "notify",
@@ -1303,6 +1304,8 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
+ "tower 0.5.2",
+ "tower-http",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -1602,6 +1605,32 @@ dependencies = [
 "typenum",
 ]

+[[package]]
+name = "curve25519-dalek"
+version = "4.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "curve25519-dalek-derive",
+ "digest",
+ "fiat-crypto",
+ "rustc_version",
+ "subtle",
+]
+
+[[package]]
+name = "curve25519-dalek-derive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+]
+
 [[package]]
 name = "darling"
 version = "0.20.1"
@@ -1650,6 +1679,20 @@ dependencies = [
 "parking_lot_core 0.9.8",
 ]

+[[package]]
+name = "dashmap"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core 0.9.8",
+]
+
 [[package]]
 name = "data-encoding"
 version = "2.4.0"
@@ -1858,6 +1901,28 @@ dependencies = [
 "spki 0.7.3",
 ]

+[[package]]
+name = "ed25519"
+version = "2.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
+dependencies = [
+ "signature 2.2.0",
+]
+
+[[package]]
+name = "ed25519-dalek"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
+dependencies = [
+ "curve25519-dalek",
+ "ed25519",
+ "rand_core 0.6.4",
+ "sha2",
+ "subtle",
+]
+
 [[package]]
 name = "either"
 version = "1.8.1"
@@ -1949,6 +2014,15 @@ dependencies = [
 "syn 2.0.90",
 ]

+[[package]]
+name = "env_filter"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
+dependencies = [
+ "log",
+]
+
 [[package]]
 name = "env_logger"
 version = "0.10.2"
@@ -1962,6 +2036,16 @@ dependencies = [
 "termcolor",
 ]

+[[package]]
+name = "env_logger"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
+dependencies = [
+ "env_filter",
+ "log",
+]
+
 [[package]]
 name = "equator"
 version = "0.2.2"
@@ -2077,6 +2161,12 @@ dependencies = [
 "subtle",
 ]

+[[package]]
+name = "fiat-crypto"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
+
 [[package]]
 name = "filetime"
 version = "0.2.22"
@@ -2720,7 +2810,7 @@ dependencies = [
 "pin-project-lite",
 "socket2",
 "tokio",
- "tower",
+ "tower 0.4.13",
 "tower-service",
 "tracing",
 ]
@@ -2945,6 +3035,28 @@ dependencies = [
 "str_stack",
 ]

+[[package]]
+name = "inferno"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe"
+dependencies = [
+ "ahash",
+ "clap",
+ "crossbeam-channel",
+ "crossbeam-utils",
+ "dashmap 6.1.0",
+ "env_logger 0.11.2",
+ "indexmap 2.0.1",
+ "itoa",
+ "log",
+ "num-format",
+ "once_cell",
+ "quick-xml 0.37.1",
+ "rgb",
+ "str_stack",
+]
+
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -3152,7 +3264,7 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
 dependencies = [
- "dashmap",
+ "dashmap 5.5.0",
 "hashbrown 0.13.2",
 ]

@@ -3260,9 +3372,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"

 [[package]]
 name = "matchit"
-version = "0.8.2"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
+checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"

 [[package]]
 name = "md-5"
@@ -3690,23 +3802,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "opentelemetry"
-version = "0.26.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
+checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
 dependencies = [
 "futures-core",
 "futures-sink",
 "js-sys",
- "once_cell",
 "pin-project-lite",
 "thiserror",
+ "tracing",
 ]

 [[package]]
 name = "opentelemetry-http"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
+checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
 dependencies = [
 "async-trait",
 "bytes",
@@ -3717,9 +3829,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
+checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
 dependencies = [
 "async-trait",
 "futures-core",
@@ -3735,9 +3847,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.26.1"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
+checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
 dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
@@ -3747,22 +3859,21 @@ dependencies = [

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.26.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
+checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.26.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
+checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
 dependencies = [
 "async-trait",
 "futures-channel",
 "futures-executor",
 "futures-util",
 "glob",
- "once_cell",
 "opentelemetry",
 "percent-encoding",
 "rand 0.8.5",
@@ -3770,6 +3881,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
+ "tracing",
 ]

 [[package]]
@@ -3932,6 +4044,7 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "postgres_initdb",
+ "pprof",
 "pq_proto",
 "procfs",
 "rand 0.8.5",
@@ -4418,7 +4531,7 @@ dependencies = [
 "bytes",
 "crc32c",
 "criterion",
- "env_logger",
+ "env_logger 0.10.2",
 "log",
 "memoffset 0.9.0",
 "once_cell",
@@ -4459,7 +4572,7 @@ dependencies = [
 "cfg-if",
 "criterion",
 "findshlibs",
- "inferno",
+ "inferno 0.11.21",
 "libc",
 "log",
 "nix 0.26.4",
@@ -4494,9 +4607,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

 [[package]]
 name = "pq-sys"
-version = "0.4.8"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
+checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
 dependencies = [
 "vcpkg",
 ]
@@ -4685,9 +4798,10 @@ dependencies = [
 "clap",
 "compute_api",
 "consumption_metrics",
- "dashmap",
+ "dashmap 5.5.0",
 "ecdsa 0.16.9",
- "env_logger",
+ "ed25519-dalek",
+ "env_logger 0.10.2",
 "fallible-iterator",
 "flate2",
 "framed-websockets",
@@ -4758,7 +4872,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres2",
 "tokio-rustls 0.26.0",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -4794,6 +4908,15 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "quick-xml"
+version = "0.37.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.37"
@@ -5178,15 +5301,15 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.5.4"
+version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
+checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
 dependencies = [
 "anyhow",
 "async-trait",
 "getrandom 0.2.11",
 "http 1.1.0",
- "matchit 0.8.2",
+ "matchit 0.8.4",
 "opentelemetry",
 "reqwest",
 "reqwest-middleware",
@@ -5584,10 +5707,12 @@ dependencies = [
 name = "safekeeper_api"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
 "const_format",
 "postgres_ffi",
 "pq_proto",
 "serde",
+ "serde_json",
 "tokio",
 "utils",
 ]
@@ -6800,7 +6925,19 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite",
+ "tungstenite 0.21.0",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.24.0",
 ]

 [[package]]
@@ -6881,7 +7018,7 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.26.0",
 "tokio-stream",
- "tower",
+ "tower 0.4.13",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -6922,16 +7059,49 @@ dependencies = [
 ]

 [[package]]
-name = "tower-layer"
-version = "0.3.2"
+name = "tower"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"
+checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper 1.0.1",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
+dependencies = [
+ "bitflags 2.4.1",
+ "bytes",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "pin-project-lite",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+ "uuid",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"

 [[package]]
 name = "tower-service"
-version = "0.3.2"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"

 [[package]]
 name = "tracing"
@@ -7000,9 +7170,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.27.0"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
+checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
 dependencies = [
 "js-sys",
 "once_cell",
@@ -7086,6 +7256,24 @@ dependencies = [
 "utf-8",
 ]

+[[package]]
+name = "tungstenite"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http 1.1.0",
+ "httparse",
+ "log",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror",
+ "utf-8",
+]
+
 [[package]]
 name = "twox-hash"
 version = "1.6.3"
@@ -7253,6 +7441,7 @@ dependencies = [
 "hex-literal",
 "humantime",
 "hyper 0.14.30",
+ "inferno 0.12.0",
 "itertools 0.10.5",
 "jemalloc_pprof",
 "jsonwebtoken",
@@ -7356,7 +7545,7 @@ dependencies = [
 "anyhow",
 "camino-tempfile",
 "clap",
- "env_logger",
+ "env_logger 0.10.2",
 "log",
 "postgres",
 "postgres_ffi",
@@ -7371,12 +7560,21 @@ dependencies = [
 "anyhow",
 "async-compression",
 "bytes",
+ "camino",
+ "camino-tempfile",
+ "criterion",
+ "futures",
 "pageserver_api",
 "postgres_ffi",
+ "pprof",
 "prost",
+ "remote_storage",
 "serde",
+ "serde_json",
 "thiserror",
+ "tikv-jemallocator",
 "tokio",
+ "tokio-util",
 "tonic",
 "tonic-build",
 "tracing",
@@ -7867,7 +8065,8 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tonic",
- "tower",
+ "tower 0.4.13",
+ "tower 0.5.2",
 "tracing",
 "tracing-core",
 "url",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
 aws-credential-types = "1.2.0"
 aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
-axum = { version = "0.7.5", features = ["ws"] }
+axum = { version = "0.7.9", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.70"
@@ -110,6 +110,7 @@ hyper-util = "0.1"
 tokio-tungstenite = "0.21.0"
 indexmap = "2"
 indoc = "2"
+inferno = "0.12.0"
 ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
@@ -126,10 +127,10 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.26"
-opentelemetry_sdk = "0.26"
-opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.26"
+opentelemetry = "0.27"
+opentelemetry_sdk = "0.27"
+opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.27"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -143,7 +144,7 @@ rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
 reqwest-middleware = "0.4"
 reqwest-retry = "0.7"
 routerify = "3"
@@ -187,10 +188,12 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
 tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
-tower-service = "0.3.2"
+tower = { version = "0.5.2", default-features = false }
+tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
+tower-service = "0.3.3"
 tracing = "0.1"
 tracing-error = "0.2"
-tracing-opentelemetry = "0.27"
+tracing-opentelemetry = "0.28"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
--- a/1
+++ b/1
@@ -71,6 +71,7 @@ RUN set -e \
        ca-certificates \
 	# System postgres for use with client libraries (e.g. in storage controller)
        postgresql-15 \
+        openssl \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
    && chown -R neon:neon /data
--- a/3
+++ b/3
@@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

-OPENSSL_PREFIX_DIR := /usr/local/openssl
 ICU_PREFIX_DIR := /usr/local/icu

 #
@@ -26,11 +25,9 @@ endif
 ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
 	# Exclude static build openssl, icu for local build (MacOS, Linux)
 	# Only keep for build type release and debug
-	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
 	PG_CONFIGURE_OPTS += --with-icu
 	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
 	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
-	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
 endif

 UNAME_S := $(shell uname -s)
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -115,7 +115,7 @@ RUN set -e \

 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.16.0
+ENV SQL_EXPORTER_VERSION=0.17.0
 RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
@@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
    && make install \
    && rm -rf ../lcov.tar.gz

-# Compile and install the static OpenSSL library
-ENV OPENSSL_VERSION=1.1.1w
-ENV OPENSSL_PREFIX=/usr/local/openssl
-RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
-    echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
-    cd /tmp && \
-    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    cd /tmp/openssl-${OPENSSL_VERSION} && \
-    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
-    make -j "$(nproc)" && \
-    make install && \
-    cd /tmp && \
-    rm -rf /tmp/openssl-${OPENSSL_VERSION}
-
 # Use the same version of libicu as the compute nodes so that
 # clusters created using inidb on pageserver can be used by computes.
 #
@@ -258,7 +243,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.83.0
+ENV RUSTC_VERSION=1.84.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -15,6 +15,7 @@ aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-sdk-kms.workspace = true
 anyhow.workspace = true
+axum = { workspace = true, features = [] }
 camino.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
@@ -22,7 +23,7 @@ clap.workspace = true
 fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
-hyper0 = { workspace = true, features = ["full"] }
+http.workspace = true
 metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
@@ -37,6 +38,8 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
+tower.workspace = true
+tower-http.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -60,7 +60,7 @@ use compute_tools::compute::{
 };
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version_string;
-use compute_tools::http::api::launch_http_server;
+use compute_tools::http::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
@@ -68,7 +68,6 @@ use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
 use rlimit::{setrlimit, Resource};
 use utils::failpoint_support;
-use utils::id::{TenantId, TimelineId};

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -88,9 +87,9 @@ fn main() -> Result<()> {

        let cli_args = process_cli(&clap_args)?;

-        // let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
+        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;

-        let wait_spec_result = wait_spec(build_tag, cli_args)?;
+        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;

        start_postgres(&clap_args, wait_spec_result)?

@@ -112,11 +111,6 @@ fn main() -> Result<()> {
 fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    opentelemetry::global::set_error_handler(|err| {
-        tracing::info!("OpenTelemetry error: {err}");
-    })
-    .expect("global error handler lock poisoned");
-
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
@@ -314,41 +308,14 @@ fn wait_spec(
        http_port,
        ..
    }: ProcessCliResult,
+    CliSpecParams {
+        spec,
+        live_config_allowed,
+    }: CliSpecParams,
 ) -> Result<WaitSpecResult> {
    let mut new_state = ComputeState::new();
    let spec_set;

-    let live_config_allowed = true;
-
-    let spec = Some(ComputeSpec {
-        // format_version: todo!(),
-        // operation_uuid: todo!(),
-        // features: todo!(),
-        // swap_size_bytes: todo!(),
-        // disk_quota_bytes: todo!(),
-        // disable_lfc_resizing: todo!(),
-        // cluster: todo!(),
-        // delta_operations: todo!(),
-        // skip_pg_catalog_updates: todo!(),
-        // tenant_id: todo!(),
-        // timeline_id: todo!(),
-        // pageserver_connstring: todo!(),
-        // safekeeper_connstrings: todo!(),
-        // mode: todo!(),
-        // storage_auth_token: todo!(),
-        // remote_extensions: todo!(),
-        // pgbouncer_settings: todo!(),
-        // shard_stripe_size: todo!(),
-        // local_proxy_config: todo!(),
-        // reconfigure_concurrency: todo!(),
-        pageserver_connstring: Some("pageserver-1.example.com:5432".to_string()),
-        safekeeper_connstrings: vec!["safekeeper-1.example.com:5432".to_string()],
-        tenant_id: Some(TenantId::generate()),
-        timeline_id: Some(TimelineId::generate()),
-
-        ..Default::default()
-    });
-
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        info!("new pspec.spec: {:?}", pspec.spec);
@@ -383,7 +350,9 @@ fn wait_spec(
    // available for binding. Prewarming helps Postgres start quicker later,
    // because QEMU will already have its memory allocated from the host, and
    // the necessary binaries will already be cached.
-    compute.prewarm_postgres()?;
+    if !spec_set {
+        compute.prewarm_postgres()?;
+    }

    // Launch http service first, so that we can serve control-plane requests
    // while configuration is still in progress.
@@ -519,7 +488,10 @@ fn start_postgres(
    let mut pg = None;
    if !prestartup_failed {
        pg = match compute.start_compute() {
-            Ok(pg) => Some(pg),
+            Ok(pg) => {
+                info!(postmaster_pid = %pg.0.id(), "Postgres was started");
+                Some(pg)
+            }
            Err(err) => {
                error!("could not start the compute node: {:#}", err);
                compute.set_failed_status(err);
@@ -617,6 +589,8 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
    // propagate to Postgres and it will be shut down as well.
    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
+        info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
+
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -17,7 +17,7 @@
 //!
 //! # Local Testing
 //!
-//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
+//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
 //! - Build the image with the following command:
 //!
 //! ```bash
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat

 #[derive(Debug, thiserror::Error)]
 pub enum SchemaDumpError {
-    #[error("Database does not exist.")]
+    #[error("database does not exist")]
    DatabaseDoesNotExist,
-    #[error("Failed to execute pg_dump.")]
+    #[error("failed to execute pg_dump")]
    IO(#[from] std::io::Error),
-    #[error("Unexpected error.")]
+    #[error("unexpected I/O error")]
    Unexpected,
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -358,22 +358,64 @@ impl ComputeNode {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Instant::now();

-        // Open backup file directly
-        let backup_file = std::fs::File::open("/var/db/backups/backup.tar.gz")?;
-        let mut measured_reader = MeasuredReader::new(backup_file);
+        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
+        let mut config = postgres::Config::from_str(shard0_connstr)?;
+
+        // Use the storage auth token from the spec, if given.
+        // Note: this overrides any password set in the connection string.
+        if let Some(storage_auth_token) = &spec.storage_auth_token {
+            info!("Got storage auth token from spec file");
+            config.password(storage_auth_token);
+        } else {
+            info!("Storage auth token not set");
+        }
+
+        // Connect to pageserver
+        let mut client = config.connect(NoTls)?;
+        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+
+        let basebackup_cmd = match lsn {
+            Lsn(0) => {
+                if spec.spec.mode != ComputeMode::Primary {
+                    format!(
+                        "basebackup {} {} --gzip --replica",
+                        spec.tenant_id, spec.timeline_id
+                    )
+                } else {
+                    format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
+                }
+            }
+            _ => {
+                if spec.spec.mode != ComputeMode::Primary {
+                    format!(
+                        "basebackup {} {} {} --gzip --replica",
+                        spec.tenant_id, spec.timeline_id, lsn
+                    )
+                } else {
+                    format!(
+                        "basebackup {} {} {} --gzip",
+                        spec.tenant_id, spec.timeline_id, lsn
+                    )
+                }
+            }
+        };
+
+        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+        let mut measured_reader = MeasuredReader::new(copyreader);
        let mut bufreader = std::io::BufReader::new(&mut measured_reader);

-        // Read the archive directly from the file
+        // Read the archive directly from the `CopyOutReader`
        //
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
-        // doesn't stop at the end-of-archive marker.
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
        ar.set_ignore_zeros(true);
        ar.unpack(&self.pgdata)?;

        // Report metrics
        let mut state = self.state.lock().unwrap();
-        state.metrics.pageserver_connect_micros = 0;
+        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
        Ok(())
@@ -586,7 +628,32 @@ impl ComputeNode {
            self.http_port,
        )?;

-        let lsn = Lsn(0);
+        // Syncing safekeepers is only safe with primary nodes: if a primary
+        // is already connected it will be kicked out, so a secondary (standby)
+        // cannot sync safekeepers.
+        let lsn = match spec.mode {
+            ComputeMode::Primary => {
+                info!("checking if safekeepers are synced");
+                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
+                    lsn
+                } else {
+                    info!("starting safekeepers syncing");
+                    self.sync_safekeepers(pspec.storage_auth_token.clone())
+                        .with_context(|| "failed to sync safekeepers")?
+                };
+                info!("safekeepers synced at LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Static(lsn) => {
+                info!("Starting read-only node at static LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Replica => {
+                info!("Initializing standby from latest Pageserver LSN");
+                Lsn(0)
+            }
+        };
+
        info!(
            "getting basebackup@{} from pageserver {}",
            lsn, &pspec.pageserver_connstr
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -1,606 +0,0 @@
-use std::convert::Infallible;
-use std::net::IpAddr;
-use std::net::Ipv6Addr;
-use std::net::SocketAddr;
-use std::sync::Arc;
-use std::thread;
-
-use crate::catalog::SchemaDumpError;
-use crate::catalog::{get_database_schema, get_dbs_and_roles};
-use crate::compute::forward_termination_signal;
-use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
-use crate::installed_extensions;
-use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
-use compute_api::responses::{
-    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
-    SetRoleGrantsResponse,
-};
-
-use anyhow::Result;
-use hyper::header::CONTENT_TYPE;
-use hyper::service::{make_service_fn, service_fn};
-use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use metrics::proto::MetricFamily;
-use metrics::Encoder;
-use metrics::TextEncoder;
-use tokio::task;
-use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
-use tracing_utils::http::OtelName;
-use utils::failpoint_support::failpoints_handler;
-use utils::http::error::ApiError;
-use utils::http::request::must_get_query_param;
-
-fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
-    ComputeStatusResponse {
-        start_time: state.start_time,
-        tenant: state
-            .pspec
-            .as_ref()
-            .map(|pspec| pspec.tenant_id.to_string()),
-        timeline: state
-            .pspec
-            .as_ref()
-            .map(|pspec| pspec.timeline_id.to_string()),
-        status: state.status,
-        last_active: state.last_active,
-        error: state.error.clone(),
-    }
-}
-
-// Service function to handle all available routes.
-async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
-    //
-    // NOTE: The URI path is currently included in traces. That's OK because
-    // it doesn't contain any variable parts or sensitive information. But
-    // please keep that in mind if you change the routing here.
-    //
-    match (req.method(), req.uri().path()) {
-        // Serialized compute state.
-        (&Method::GET, "/status") => {
-            debug!("serving /status GET request");
-            let state = compute.state.lock().unwrap();
-            let status_response = status_response_from_state(&state);
-            Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
-        }
-
-        // Startup metrics in JSON format. Keep /metrics reserved for a possible
-        // future use for Prometheus metrics format.
-        (&Method::GET, "/metrics.json") => {
-            info!("serving /metrics.json GET request");
-            let metrics = compute.state.lock().unwrap().metrics.clone();
-            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
-        }
-
-        // Prometheus metrics
-        (&Method::GET, "/metrics") => {
-            debug!("serving /metrics GET request");
-
-            // When we call TextEncoder::encode() below, it will immediately
-            // return an error if a metric family has no metrics, so we need to
-            // preemptively filter out metric families with no metrics.
-            let metrics = installed_extensions::collect()
-                .into_iter()
-                .filter(|m| !m.get_metric().is_empty())
-                .collect::<Vec<MetricFamily>>();
-
-            let encoder = TextEncoder::new();
-            let mut buffer = vec![];
-
-            if let Err(err) = encoder.encode(&metrics, &mut buffer) {
-                let msg = format!("error handling /metrics request: {err}");
-                error!(msg);
-                return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
-            }
-
-            match Response::builder()
-                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, encoder.format_type())
-                .body(Body::from(buffer))
-            {
-                Ok(response) => response,
-                Err(err) => {
-                    let msg = format!("error handling /metrics request: {err}");
-                    error!(msg);
-                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-        // Collect Postgres current usage insights
-        (&Method::GET, "/insights") => {
-            info!("serving /insights GET request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!("compute is not running, current status: {:?}", status);
-                error!(msg);
-                return Response::new(Body::from(msg));
-            }
-
-            let insights = compute.collect_insights().await;
-            Response::new(Body::from(insights))
-        }
-
-        (&Method::POST, "/check_writability") => {
-            info!("serving /check_writability POST request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for check_writability request: {:?}",
-                    status
-                );
-                error!(msg);
-                return Response::new(Body::from(msg));
-            }
-
-            let res = crate::checker::check_writability(compute).await;
-            match res {
-                Ok(_) => Response::new(Body::from("true")),
-                Err(e) => {
-                    error!("check_writability failed: {}", e);
-                    Response::new(Body::from(e.to_string()))
-                }
-            }
-        }
-
-        (&Method::POST, "/extensions") => {
-            info!("serving /extensions POST request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for extensions request: {:?}",
-                    status
-                );
-                error!(msg);
-                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
-            }
-
-            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
-            let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
-            let res = compute
-                .install_extension(&request.extension, &request.database, request.version)
-                .await;
-            match res {
-                Ok(version) => render_json(Body::from(
-                    serde_json::to_string(&ExtensionInstallResult {
-                        extension: request.extension,
-                        version,
-                    })
-                    .unwrap(),
-                )),
-                Err(e) => {
-                    error!("install_extension failed: {}", e);
-                    render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
-        (&Method::GET, "/info") => {
-            let num_cpus = num_cpus::get_physical();
-            info!("serving /info GET request. num_cpus: {}", num_cpus);
-            Response::new(Body::from(
-                serde_json::json!({
-                    "num_cpus": num_cpus,
-                })
-                .to_string(),
-            ))
-        }
-
-        // Accept spec in JSON format and request compute configuration. If
-        // anything goes wrong after we set the compute status to `ConfigurationPending`
-        // and update compute state with new spec, we basically leave compute
-        // in the potentially wrong state. That said, it's control-plane's
-        // responsibility to watch compute state after reconfiguration request
-        // and to clean restart in case of errors.
-        (&Method::POST, "/configure") => {
-            info!("serving /configure POST request");
-            match handle_configure_request(req, compute).await {
-                Ok(msg) => Response::new(Body::from(msg)),
-                Err((msg, code)) => {
-                    error!("error handling /configure request: {msg}");
-                    render_json_error(&msg, code)
-                }
-            }
-        }
-
-        (&Method::POST, "/terminate") => {
-            info!("serving /terminate POST request");
-            match handle_terminate_request(compute).await {
-                Ok(()) => Response::new(Body::empty()),
-                Err((msg, code)) => {
-                    error!("error handling /terminate request: {msg}");
-                    render_json_error(&msg, code)
-                }
-            }
-        }
-
-        (&Method::GET, "/dbs_and_roles") => {
-            info!("serving /dbs_and_roles GET request",);
-            match get_dbs_and_roles(compute).await {
-                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
-                Err(_) => {
-                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
-        (&Method::GET, "/database_schema") => {
-            let database = match must_get_query_param(&req, "database") {
-                Err(e) => return e.into_response(),
-                Ok(database) => database,
-            };
-            info!("serving /database_schema GET request with database: {database}",);
-            match get_database_schema(compute, &database).await {
-                Ok(res) => render_plain(Body::wrap_stream(res)),
-                Err(SchemaDumpError::DatabaseDoesNotExist) => {
-                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
-                }
-                Err(e) => {
-                    error!("can't get schema dump: {}", e);
-                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
-        (&Method::POST, "/grants") => {
-            info!("serving /grants POST request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for set_role_grants request: {:?}",
-                    status
-                );
-                error!(msg);
-                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
-            }
-
-            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
-            let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
-
-            let res = compute
-                .set_role_grants(
-                    &request.database,
-                    &request.schema,
-                    &request.privileges,
-                    &request.role,
-                )
-                .await;
-            match res {
-                Ok(()) => render_json(Body::from(
-                    serde_json::to_string(&SetRoleGrantsResponse {
-                        database: request.database,
-                        schema: request.schema,
-                        role: request.role,
-                        privileges: request.privileges,
-                    })
-                    .unwrap(),
-                )),
-                Err(e) => render_json_error(
-                    &format!("could not grant role privileges to the schema: {e}"),
-                    // TODO: can we filter on role/schema not found errors
-                    // and return appropriate error code?
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                ),
-            }
-        }
-
-        // get the list of installed extensions
-        // currently only used in python tests
-        // TODO: call it from cplane
-        (&Method::GET, "/installed_extensions") => {
-            info!("serving /installed_extensions GET request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for extensions request: {:?}",
-                    status
-                );
-                error!(msg);
-                return Response::new(Body::from(msg));
-            }
-
-            let conf = compute.get_conn_conf(None);
-            let res =
-                task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
-                    .await
-                    .unwrap();
-
-            match res {
-                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
-                Err(e) => render_json_error(
-                    &format!("could not get list of installed extensions: {}", e),
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                ),
-            }
-        }
-
-        (&Method::POST, "/failpoints") if cfg!(feature = "testing") => {
-            match failpoints_handler(req, CancellationToken::new()).await {
-                Ok(r) => r,
-                Err(ApiError::BadRequest(e)) => {
-                    render_json_error(&e.to_string(), StatusCode::BAD_REQUEST)
-                }
-                Err(_) => {
-                    render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
-        // download extension files from remote extension storage on demand
-        (&Method::POST, route) if route.starts_with("/extension_server/") => {
-            info!("serving {:?} POST request", route);
-            info!("req.uri {:?}", req.uri());
-
-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
-
-            let mut is_library = false;
-            if let Some(params) = req.uri().query() {
-                info!("serving {:?} POST request with params: {}", route, params);
-                if params == "is_library=true" {
-                    is_library = true;
-                } else {
-                    let mut resp = Response::new(Body::from("Wrong request parameters"));
-                    *resp.status_mut() = StatusCode::BAD_REQUEST;
-                    return resp;
-                }
-            }
-            let filename = route.split('/').last().unwrap().to_string();
-            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
-
-            // get ext_name and path from spec
-            // don't lock compute_state for too long
-            let ext = {
-                let compute_state = compute.state.lock().unwrap();
-                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-                let spec = &pspec.spec;
-
-                // debug only
-                info!("spec: {:?}", spec);
-
-                let remote_extensions = match spec.remote_extensions.as_ref() {
-                    Some(r) => r,
-                    None => {
-                        info!("no remote extensions spec was provided");
-                        let mut resp = Response::new(Body::from("no remote storage configured"));
-                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                        return resp;
-                    }
-                };
-
-                remote_extensions.get_ext(
-                    &filename,
-                    is_library,
-                    &compute.build_tag,
-                    &compute.pgversion,
-                )
-            };
-
-            match ext {
-                Ok((ext_name, ext_path)) => {
-                    match compute.download_extension(ext_name, ext_path).await {
-                        Ok(_) => Response::new(Body::from("OK")),
-                        Err(e) => {
-                            error!("extension download failed: {}", e);
-                            let mut resp = Response::new(Body::from(e.to_string()));
-                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                            resp
-                        }
-                    }
-                }
-                Err(e) => {
-                    warn!("extension download failed to find extension: {}", e);
-                    let mut resp = Response::new(Body::from("failed to find file"));
-                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                    resp
-                }
-            }
-        }
-
-        // Return the `404 Not Found` for any other routes.
-        _ => {
-            let mut not_found = Response::new(Body::from("404 Not Found"));
-            *not_found.status_mut() = StatusCode::NOT_FOUND;
-            not_found
-        }
-    }
-}
-
-async fn handle_configure_request(
-    req: Request<Body>,
-    compute: &Arc<ComputeNode>,
-) -> Result<String, (String, StatusCode)> {
-    if !compute.live_config_allowed {
-        return Err((
-            "live configuration is not allowed for this compute node".to_string(),
-            StatusCode::PRECONDITION_FAILED,
-        ));
-    }
-
-    let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
-    let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
-    if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
-        let spec = request.spec;
-
-        let parsed_spec = match ParsedSpec::try_from(spec) {
-            Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
-        };
-
-        // XXX: wrap state update under lock in code blocks. Otherwise,
-        // we will try to `Send` `mut state` into the spawned thread
-        // bellow, which will cause error:
-        // ```
-        // error: future cannot be sent between threads safely
-        // ```
-        {
-            let mut state = compute.state.lock().unwrap();
-            if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for configuration request: {:?}",
-                    state.status.clone()
-                );
-                return Err((msg, StatusCode::PRECONDITION_FAILED));
-            }
-            state.pspec = Some(parsed_spec);
-            state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
-            drop(state);
-            info!("set new spec and notified waiters");
-        }
-
-        // Spawn a blocking thread to wait for compute to become Running.
-        // This is needed to do not block the main pool of workers and
-        // be able to serve other requests while some particular request
-        // is waiting for compute to finish configuration.
-        let c = compute.clone();
-        task::spawn_blocking(move || {
-            let mut state = c.state.lock().unwrap();
-            while state.status != ComputeStatus::Running {
-                state = c.state_changed.wait(state).unwrap();
-                info!(
-                    "waiting for compute to become Running, current status: {:?}",
-                    state.status
-                );
-
-                if state.status == ComputeStatus::Failed {
-                    let err = state.error.as_ref().map_or("unknown error", |x| x);
-                    let msg = format!("compute configuration failed: {:?}", err);
-                    return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
-                }
-            }
-
-            Ok(())
-        })
-        .await
-        .unwrap()?;
-
-        // Return current compute state if everything went well.
-        let state = compute.state.lock().unwrap().clone();
-        let status_response = status_response_from_state(&state);
-        Ok(serde_json::to_string(&status_response).unwrap())
-    } else {
-        Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
-    }
-}
-
-fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
-    let error = GenericAPIError {
-        error: e.to_string(),
-    };
-    Response::builder()
-        .status(status)
-        .header(CONTENT_TYPE, "application/json")
-        .body(Body::from(serde_json::to_string(&error).unwrap()))
-        .unwrap()
-}
-
-fn render_json(body: Body) -> Response<Body> {
-    Response::builder()
-        .header(CONTENT_TYPE, "application/json")
-        .body(body)
-        .unwrap()
-}
-
-fn render_plain(body: Body) -> Response<Body> {
-    Response::builder()
-        .header(CONTENT_TYPE, "text/plain")
-        .body(body)
-        .unwrap()
-}
-
-async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
-    {
-        let mut state = compute.state.lock().unwrap();
-        if state.status == ComputeStatus::Terminated {
-            return Ok(());
-        }
-        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
-            let msg = format!(
-                "invalid compute status for termination request: {}",
-                state.status
-            );
-            return Err((msg, StatusCode::PRECONDITION_FAILED));
-        }
-        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
-        drop(state);
-    }
-
-    forward_termination_signal();
-    info!("sent signal and notified waiters");
-
-    // Spawn a blocking thread to wait for compute to become Terminated.
-    // This is needed to do not block the main pool of workers and
-    // be able to serve other requests while some particular request
-    // is waiting for compute to finish configuration.
-    let c = compute.clone();
-    task::spawn_blocking(move || {
-        let mut state = c.state.lock().unwrap();
-        while state.status != ComputeStatus::Terminated {
-            state = c.state_changed.wait(state).unwrap();
-            info!(
-                "waiting for compute to become {}, current status: {:?}",
-                ComputeStatus::Terminated,
-                state.status
-            );
-        }
-
-        Ok(())
-    })
-    .await
-    .unwrap()?;
-    info!("terminated Postgres");
-    Ok(())
-}
-
-// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
-#[tokio::main]
-async fn serve(port: u16, state: Arc<ComputeNode>) {
-    // this usually binds to both IPv4 and IPv6 on linux
-    // see e.g. https://github.com/rust-lang/rust/pull/34440
-    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
-
-    let make_service = make_service_fn(move |_conn| {
-        let state = state.clone();
-        async move {
-            Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
-                let state = state.clone();
-                async move {
-                    Ok::<_, Infallible>(
-                        // NOTE: We include the URI path in the string. It
-                        // doesn't contain any variable parts or sensitive
-                        // information in this API.
-                        tracing_utils::http::tracing_handler(
-                            req,
-                            |req| routes(req, &state),
-                            OtelName::UriPath,
-                        )
-                        .await,
-                    )
-                }
-            }))
-        }
-    });
-
-    info!("starting HTTP server on {}", addr);
-
-    let server = Server::bind(&addr).serve(make_service);
-
-    // Run this server forever
-    if let Err(e) = server.await {
-        error!("server error: {}", e);
-    }
-}
-
-/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
-    let state = Arc::clone(state);
-
-    Ok(thread::Builder::new()
-        .name("http-endpoint".into())
-        .spawn(move || serve(port, state))?)
-}
--- a/compute_tools/src/http/extract/json.rs
+++ b/compute_tools/src/http/extract/json.rs
@@ -0,0 +1,48 @@
+use std::ops::{Deref, DerefMut};
+
+use axum::{
+    async_trait,
+    extract::{rejection::JsonRejection, FromRequest, Request},
+};
+use compute_api::responses::GenericAPIError;
+use http::StatusCode;
+
+/// Custom `Json` extractor, so that we can format errors into
+/// `JsonResponse<GenericAPIError>`.
+#[derive(Debug, Clone, Copy, Default)]
+pub(crate) struct Json<T>(pub T);
+
+#[async_trait]
+impl<S, T> FromRequest<S> for Json<T>
+where
+    axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
+    S: Send + Sync,
+{
+    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
+
+    /// Delegate to `axum::Json`; a rejection becomes its own status code plus a
+    /// `GenericAPIError` body holding the ASCII-lowercased rejection text.
+    async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
+        match axum::Json::<T>::from_request(req, state).await {
+            Ok(value) => Ok(Self(value.0)),
+            Err(rejection) => {
+                let error = rejection.body_text().to_ascii_lowercase();
+                Err((rejection.status(), axum::Json(GenericAPIError { error })))
+            }
+        }
+    }
+}
+
+impl<T> Deref for Json<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> DerefMut for Json<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -0,0 +1,7 @@
+pub(crate) mod json;
+pub(crate) mod path;
+pub(crate) mod query;
+
+pub(crate) use json::Json;
+pub(crate) use path::Path;
+pub(crate) use query::Query;
--- a/compute_tools/src/http/extract/path.rs
+++ b/compute_tools/src/http/extract/path.rs
@@ -0,0 +1,48 @@
+use std::ops::{Deref, DerefMut};
+
+use axum::{
+    async_trait,
+    extract::{rejection::PathRejection, FromRequestParts},
+};
+use compute_api::responses::GenericAPIError;
+use http::{request::Parts, StatusCode};
+
+/// `Path` extractor wrapper whose rejections are rendered as a JSON
+/// `GenericAPIError` body carrying the rejection's own status code.
+#[derive(Debug, Clone, Copy, Default)]
+pub(crate) struct Path<T>(pub T);
+
+#[async_trait]
+impl<S, T> FromRequestParts<S> for Path<T>
+where
+    axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
+    S: Send + Sync,
+{
+    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
+
+    async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
+        // Run axum's own extractor first; only the shape of the error is
+        // customized here.
+        let extracted = axum::extract::Path::<T>::from_request_parts(parts, state).await;
+
+        extracted.map(|inner| Self(inner.0)).map_err(|rejection| {
+            let error = rejection.body_text().to_ascii_lowercase();
+
+            (rejection.status(), axum::Json(GenericAPIError { error }))
+        })
+    }
+}
+
+impl<T> Deref for Path<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> DerefMut for Path<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
--- a/compute_tools/src/http/extract/query.rs
+++ b/compute_tools/src/http/extract/query.rs
@@ -0,0 +1,48 @@
+use std::ops::{Deref, DerefMut};
+
+use axum::{
+    async_trait,
+    extract::{rejection::QueryRejection, FromRequestParts},
+};
+use compute_api::responses::GenericAPIError;
+use http::{request::Parts, StatusCode};
+
+/// `Query` extractor wrapper whose rejections are rendered as a JSON
+/// `GenericAPIError` body carrying the rejection's own status code.
+#[derive(Debug, Clone, Copy, Default)]
+pub(crate) struct Query<T>(pub T);
+
+#[async_trait]
+impl<S, T> FromRequestParts<S> for Query<T>
+where
+    axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
+    S: Send + Sync,
+{
+    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
+
+    async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
+        // Run axum's own extractor first; only the shape of the error is
+        // customized here.
+        let extracted = axum::extract::Query::<T>::from_request_parts(parts, state).await;
+
+        extracted.map(|inner| Self(inner.0)).map_err(|rejection| {
+            let error = rejection.body_text().to_ascii_lowercase();
+
+            (rejection.status(), axum::Json(GenericAPIError { error }))
+        })
+    }
+}
+
+impl<T> Deref for Query<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> DerefMut for Query<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -1 +1,56 @@
-pub mod api;
+use axum::{body::Body, response::Response};
+use compute_api::responses::{ComputeStatus, GenericAPIError};
+use http::{header::CONTENT_TYPE, StatusCode};
+use serde::Serialize;
+use tracing::error;
+
+pub use server::launch_http_server;
+
+mod extract;
+mod routes;
+mod server;
+
+/// Convenience response builder for JSON responses
+struct JsonResponse;
+
+impl JsonResponse {
+    /// Serialize `body` to JSON and wrap it in a response with status `code`.
+    ///
+    /// Panics if `body` cannot be serialized or the response cannot be built.
+    fn create_response(code: StatusCode, body: impl Serialize) -> Response {
+        Response::builder()
+            .status(code)
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&body).unwrap()))
+            .unwrap()
+    }
+
+    /// Create a successful response; panics unless `code` is a 2xx status.
+    fn success(code: StatusCode, body: impl Serialize) -> Response {
+        assert!(code.is_success(), "{code} is not a success status");
+
+        Self::create_response(code, body)
+    }
+
+    /// Log `error` and return it as a `GenericAPIError` body.
+    ///
+    /// Panics if `code` is below 400.
+    fn error(code: StatusCode, error: impl ToString) -> Response {
+        assert!(code.as_u16() >= 400, "{code} is not an error status");
+
+        let message = error.to_string();
+        error!(message);
+
+        Self::create_response(code, &GenericAPIError { error: message })
+    }
+
+    /// Create an error response related to the compute being in an invalid state
+    fn invalid_status(status: ComputeStatus) -> Response {
+        Self::create_response(
+            StatusCode::PRECONDITION_FAILED,
+            &GenericAPIError {
+                error: format!("invalid compute status: {status}"),
+            },
+        )
+    }
+}
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,7 +37,7 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

-  /metrics
+  /metrics:
    get:
      tags:
      - Info
--- a/compute_tools/src/http/routes/check_writability.rs
+++ b/compute_tools/src/http/routes/check_writability.rs
@@ -0,0 +1,20 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::responses::ComputeStatus;
+use http::StatusCode;
+
+use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
+
+/// Check that the compute is running and passes the writability check.
+pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
+    match compute.get_status() {
+        ComputeStatus::Running => {}
+        other => return JsonResponse::invalid_status(other),
+    }
+
+    if let Err(e) = check_writability(&compute).await {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
+    }
+    JsonResponse::success(StatusCode::OK, true)
+}
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -0,0 +1,91 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::{
+    requests::ConfigurationRequest,
+    responses::{ComputeStatus, ComputeStatusResponse},
+};
+use http::StatusCode;
+use tokio::task;
+use tracing::info;
+
+use crate::{
+    compute::{ComputeNode, ParsedSpec},
+    http::{extract::Json, JsonResponse},
+};
+
+// Accept spec in JSON format and request compute configuration. If anything
+// goes wrong after we set the compute status to `ConfigurationPending` and
+// update compute state with new spec, we basically leave compute in the
+// potentially wrong state. That said, it's control-plane's responsibility to
+// watch compute state after reconfiguration request and to clean restart in
+// case of errors.
+pub(in crate::http) async fn configure(
+    State(compute): State<Arc<ComputeNode>>,
+    request: Json<ConfigurationRequest>,
+) -> Response {
+    if !compute.live_config_allowed {
+        return JsonResponse::error(
+            StatusCode::PRECONDITION_FAILED,
+            "live configuration is not allowed for this compute node".to_string(),
+        );
+    }
+
+    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
+        Ok(p) => p,
+        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
+    };
+
+    // XXX: wrap state update under lock in a code block. Otherwise, we will try
+    // to `Send` `mut state` into the spawned thread below, which will cause
+    // the following rustc error:
+    //
+    // error: future cannot be sent between threads safely
+    {
+        let mut state = compute.state.lock().unwrap();
+        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
+            return JsonResponse::invalid_status(state.status);
+        }
+
+        state.pspec = Some(pspec);
+        state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
+    }
+
+    // Wait on a blocking thread for the compute to become Running, so that the
+    // async workers stay free to serve other requests in the meantime.
+    let c = compute.clone();
+    let completed = task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        loop {
+            // Check the status before every wait: it may already be final, in
+            // which case no further notification would ever wake us up.
+            match state.status {
+                ComputeStatus::Running => return Ok(()),
+                ComputeStatus::Failed => {
+                    let err = state.error.as_ref().map_or("unknown error", |x| x);
+                    return Err(format!("compute configuration failed: {:?}", err));
+                }
+                _ => {}
+            }
+
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become {}, current status: {}",
+                ComputeStatus::Running,
+                state.status
+            );
+        }
+    })
+    .await
+    .unwrap();
+
+    if let Err(e) = completed {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
+    }
+
+    // Return current compute state if everything went well.
+    let state = compute.state.lock().unwrap().clone();
+    let body = ComputeStatusResponse::from(&state);
+
+    JsonResponse::success(StatusCode::OK, body)
+}
--- a/compute_tools/src/http/routes/database_schema.rs
+++ b/compute_tools/src/http/routes/database_schema.rs
@@ -0,0 +1,34 @@
+use std::sync::Arc;
+
+use axum::{body::Body, extract::State, response::Response};
+use http::{header::CONTENT_TYPE, StatusCode};
+use serde::Deserialize;
+
+use crate::{
+    catalog::{get_database_schema, SchemaDumpError},
+    compute::ComputeNode,
+    http::{extract::Query, JsonResponse},
+};
+
+#[derive(Debug, Clone, Deserialize)]
+pub(in crate::http) struct DatabaseSchemaParams {
+    database: String,
+}
+
+/// Stream a schema dump of the database named by the `database` query parameter.
+pub(in crate::http) async fn get_schema_dump(
+    params: Query<DatabaseSchemaParams>,
+    State(compute): State<Arc<ComputeNode>>,
+) -> Response {
+    match get_database_schema(&compute, &params.database).await {
+        Err(e @ SchemaDumpError::DatabaseDoesNotExist) => {
+            JsonResponse::error(StatusCode::NOT_FOUND, e)
+        }
+        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
+        Ok(schema) => Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE.as_str(), "application/json")
+            .body(Body::from_stream(schema))
+            .unwrap(),
+    }
+}
--- a/compute_tools/src/http/routes/dbs_and_roles.rs
+++ b/compute_tools/src/http/routes/dbs_and_roles.rs
@@ -0,0 +1,16 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use http::StatusCode;
+
+use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
+
+/// List the databases and roles of the compute as a JSON document.
+pub(in crate::http) async fn get_catalog_objects(
+    State(compute): State<Arc<ComputeNode>>,
+) -> Response {
+    get_dbs_and_roles(&compute).await.map_or_else(
+        |e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
+        |objects| JsonResponse::success(StatusCode::OK, objects),
+    )
+}
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -0,0 +1,67 @@
+use std::sync::Arc;
+
+use axum::{
+    extract::State,
+    response::{IntoResponse, Response},
+};
+use http::StatusCode;
+use serde::Deserialize;
+
+use crate::{
+    compute::ComputeNode,
+    http::{
+        extract::{Path, Query},
+        JsonResponse,
+    },
+};
+
+#[derive(Debug, Clone, Deserialize)]
+pub(in crate::http) struct ExtensionServerParams {
+    is_library: Option<bool>,
+}
+
+/// Download a remote extension. Responds with an error, rather than
+/// panicking, when the compute has no spec yet.
+pub(in crate::http) async fn download_extension(
+    Path(filename): Path<String>,
+    params: Query<ExtensionServerParams>,
+    State(compute): State<Arc<ComputeNode>>,
+) -> Response {
+    // Don't even try to download extensions if no remote storage is configured
+    if compute.ext_remote_storage.is_none() {
+        return JsonResponse::error(
+            StatusCode::PRECONDITION_FAILED,
+            "remote storage is not configured",
+        );
+    }
+
+    let ext = {
+        let state = compute.state.lock().unwrap();
+        // Never panic while holding the state lock: that would poison the mutex
+        // for every other handler.
+        let Some(pspec) = state.pspec.as_ref() else {
+            return JsonResponse::error(StatusCode::CONFLICT, "compute spec is unavailable");
+        };
+        let Some(remote_extensions) = pspec.spec.remote_extensions.as_ref() else {
+            return JsonResponse::error(
+                StatusCode::CONFLICT,
+                "information about remote extensions is unavailable",
+            );
+        };
+
+        remote_extensions.get_ext(
+            &filename,
+            params.is_library.unwrap_or(false),
+            &compute.build_tag,
+            &compute.pgversion,
+        )
+    };
+
+    match ext {
+        Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
+            Ok(_) => StatusCode::OK.into_response(),
+            Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
+        },
+        Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
+    }
+}
--- a/compute_tools/src/http/routes/extensions.rs
+++ b/compute_tools/src/http/routes/extensions.rs
@@ -0,0 +1,45 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::{
+    requests::ExtensionInstallRequest,
+    responses::{ComputeStatus, ExtensionInstallResponse},
+};
+use http::StatusCode;
+
+use crate::{
+    compute::ComputeNode,
+    http::{extract::Json, JsonResponse},
+};
+
+/// Install an extension.
+pub(in crate::http) async fn install_extension(
+    State(compute): State<Arc<ComputeNode>>,
+    request: Json<ExtensionInstallRequest>,
+) -> Response {
+    match compute.get_status() {
+        ComputeStatus::Running => {}
+        other => return JsonResponse::invalid_status(other),
+    }
+
+    let installed = compute
+        .install_extension(
+            &request.extension,
+            &request.database,
+            request.version.to_string(),
+        )
+        .await;
+    match installed {
+        Ok(version) => JsonResponse::success(
+            StatusCode::CREATED,
+            Some(ExtensionInstallResponse {
+                extension: request.extension.clone(),
+                version,
+            }),
+        ),
+        Err(e) => JsonResponse::error(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            format!("failed to install extension: {e}"),
+        ),
+    }
+}
--- a/compute_tools/src/http/routes/failpoints.rs
+++ b/compute_tools/src/http/routes/failpoints.rs
@@ -0,0 +1,35 @@
+use axum::response::{IntoResponse, Response};
+use http::StatusCode;
+use tracing::info;
+use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
+
+use crate::http::{extract::Json, JsonResponse};
+
+/// Configure failpoints for testing purposes.
+pub(in crate::http) async fn configure_failpoints(
+    failpoints: Json<ConfigureFailpointsRequest>,
+) -> Response {
+    let supported = fail::has_failpoints();
+    if !supported {
+        return JsonResponse::error(
+            StatusCode::PRECONDITION_FAILED,
+            "Cannot manage failpoints because neon was compiled without failpoints support",
+        );
+    }
+
+    for fp in &*failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // Besides the actions natively known to the failpoints crate, one
+        // extra action is recognized here: exit, which kills the process
+        // immediately.
+        if let Err(e) = apply_failpoint(&fp.name, &fp.actions) {
+            return JsonResponse::error(
+                StatusCode::BAD_REQUEST,
+                format!("failed to configure failpoints: {e}"),
+            );
+        }
+    }
+
+    StatusCode::OK.into_response()
+}
--- a/compute_tools/src/http/routes/grants.rs
+++ b/compute_tools/src/http/routes/grants.rs
@@ -0,0 +1,48 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::{
+    requests::SetRoleGrantsRequest,
+    responses::{ComputeStatus, SetRoleGrantsResponse},
+};
+use http::StatusCode;
+
+use crate::{
+    compute::ComputeNode,
+    http::{extract::Json, JsonResponse},
+};
+
+/// Add grants for a role.
+pub(in crate::http) async fn add_grant(
+    State(compute): State<Arc<ComputeNode>>,
+    request: Json<SetRoleGrantsRequest>,
+) -> Response {
+    match compute.get_status() {
+        ComputeStatus::Running => {}
+        other => return JsonResponse::invalid_status(other),
+    }
+
+    let granted = compute
+        .set_role_grants(
+            &request.database,
+            &request.schema,
+            &request.privileges,
+            &request.role,
+        )
+        .await;
+    match granted {
+        Ok(()) => JsonResponse::success(
+            StatusCode::CREATED,
+            Some(SetRoleGrantsResponse {
+                database: request.database.clone(),
+                schema: request.schema.clone(),
+                role: request.role.clone(),
+                privileges: request.privileges.clone(),
+            }),
+        ),
+        Err(e) => JsonResponse::error(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            format!("failed to grant role privileges to the schema: {e}"),
+        ),
+    }
+}
--- a/compute_tools/src/http/routes/info.rs
+++ b/compute_tools/src/http/routes/info.rs
@@ -0,0 +1,11 @@
+use axum::response::Response;
+use compute_api::responses::InfoResponse;
+use http::StatusCode;
+
+use crate::http::JsonResponse;
+
+/// Get information about the physical characteristics of the compute.
+pub(in crate::http) async fn get_info() -> Response {
+    let cores = num_cpus::get_physical();
+    JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus: cores })
+}
--- a/compute_tools/src/http/routes/insights.rs
+++ b/compute_tools/src/http/routes/insights.rs
@@ -0,0 +1,18 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::responses::ComputeStatus;
+use http::StatusCode;
+
+use crate::{compute::ComputeNode, http::JsonResponse};
+
+/// Collect current Postgres usage insights; requires a running compute.
+pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
+    match compute.get_status() {
+        ComputeStatus::Running => {}
+        other => return JsonResponse::invalid_status(other),
+    }
+
+    let report = compute.collect_insights().await;
+    JsonResponse::success(StatusCode::OK, report)
+}
--- a/compute_tools/src/http/routes/installed_extensions.rs
+++ b/compute_tools/src/http/routes/installed_extensions.rs
@@ -0,0 +1,33 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::responses::ComputeStatus;
+use http::StatusCode;
+use tokio::task;
+
+use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
+
+/// Get a list of installed extensions.
+pub(in crate::http) async fn get_installed_extensions(
+    State(compute): State<Arc<ComputeNode>>,
+) -> Response {
+    match compute.get_status() {
+        ComputeStatus::Running => {}
+        other => return JsonResponse::invalid_status(other),
+    }
+
+    // `get_installed_extensions` is a synchronous call, so it is run on the
+    // blocking thread pool rather than on the async workers.
+    let conf = compute.get_conn_conf(None);
+    let exts = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
+        .await
+        .unwrap();
+
+    match exts {
+        Err(e) => JsonResponse::error(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            format!("failed to get list of installed extensions: {e}"),
+        ),
+        Ok(extensions) => JsonResponse::success(StatusCode::OK, Some(extensions)),
+    }
+}
--- a/compute_tools/src/http/routes/metrics.rs
+++ b/compute_tools/src/http/routes/metrics.rs
@@ -0,0 +1,32 @@
+use axum::{body::Body, response::Response};
+use http::header::CONTENT_TYPE;
+use http::StatusCode;
+use metrics::proto::MetricFamily;
+use metrics::Encoder;
+use metrics::TextEncoder;
+
+use crate::{http::JsonResponse, installed_extensions};
+
+/// Expose Prometheus metrics.
+pub(in crate::http) async fn get_metrics() -> Response {
+    // TextEncoder::encode() immediately returns an error when it is handed a
+    // metric family that has no metrics, so such families are dropped up
+    // front.
+    let families: Vec<MetricFamily> = installed_extensions::collect()
+        .into_iter()
+        .filter(|family| !family.get_metric().is_empty())
+        .collect();
+
+    let encoder = TextEncoder::new();
+    let mut encoded = vec![];
+
+    // An encoding failure is reported as a JSON error body.
+    match encoder.encode(&families, &mut encoded) {
+        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
+        Ok(_) => Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(encoded))
+            .unwrap(),
+    }
+}
--- a/compute_tools/src/http/routes/metrics_json.rs
+++ b/compute_tools/src/http/routes/metrics_json.rs
@@ -0,0 +1,12 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use http::StatusCode;
+
+use crate::{compute::ComputeNode, http::JsonResponse};
+
+/// Get a copy of the metrics stored in the compute state.
+pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
+    let snapshot = compute.state.lock().unwrap().metrics.clone();
+    JsonResponse::success(StatusCode::OK, snapshot)
+}
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -0,0 +1,38 @@
+use compute_api::responses::ComputeStatusResponse;
+
+use crate::compute::ComputeState;
+
+pub(in crate::http) mod check_writability;
+pub(in crate::http) mod configure;
+pub(in crate::http) mod database_schema;
+pub(in crate::http) mod dbs_and_roles;
+pub(in crate::http) mod extension_server;
+pub(in crate::http) mod extensions;
+pub(in crate::http) mod failpoints;
+pub(in crate::http) mod grants;
+pub(in crate::http) mod info;
+pub(in crate::http) mod insights;
+pub(in crate::http) mod installed_extensions;
+pub(in crate::http) mod metrics;
+pub(in crate::http) mod metrics_json;
+pub(in crate::http) mod status;
+pub(in crate::http) mod terminate;
+
+/// Builds the status response from a borrowed compute state; the state
+/// itself is left untouched.
+impl From<&ComputeState> for ComputeStatusResponse {
+    fn from(state: &ComputeState) -> Self {
+        // `tenant` and `timeline` are read from the parsed spec and stay `None`
+        // for as long as the state holds no spec.
+        let pspec = state.pspec.as_ref();
+
+        Self {
+            start_time: state.start_time,
+            tenant: pspec.map(|spec| spec.tenant_id.to_string()),
+            timeline: pspec.map(|spec| spec.timeline_id.to_string()),
+            status: state.status,
+            last_active: state.last_active,
+            error: state.error.clone(),
+        }
+    }
+}
--- a/compute_tools/src/http/routes/status.rs
+++ b/compute_tools/src/http/routes/status.rs
@@ -0,0 +1,14 @@
+use std::{ops::Deref, sync::Arc};
+
+use axum::{extract::State, http::StatusCode, response::Response};
+use compute_api::responses::ComputeStatusResponse;
+
+use crate::{compute::ComputeNode, http::JsonResponse};
+
+/// Retrieve the state of the compute.
+pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
+    let guard = compute.state.lock().unwrap();
+    let response = ComputeStatusResponse::from(guard.deref());
+
+    JsonResponse::success(StatusCode::OK, response)
+}
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -0,0 +1,58 @@
+use std::sync::Arc;
+
+use axum::{
+    extract::State,
+    response::{IntoResponse, Response},
+};
+use compute_api::responses::ComputeStatus;
+use http::StatusCode;
+use tokio::task;
+use tracing::info;
+
+use crate::{
+    compute::{forward_termination_signal, ComputeNode},
+    http::JsonResponse,
+};
+
+/// Terminate the compute.
+pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
+    {
+        let mut guard = compute.state.lock().unwrap();
+        match guard.status {
+            ComputeStatus::Terminated => return StatusCode::CREATED.into_response(),
+            ComputeStatus::Empty | ComputeStatus::Running => {}
+            other => return JsonResponse::invalid_status(other),
+        }
+
+        guard.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
+    }
+
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Wait for the compute to become Terminated on a blocking thread. This
+    // keeps the main pool of workers free to serve other requests while this
+    // particular request is waiting for the termination to finish.
+    let node = compute.clone();
+    task::spawn_blocking(move || {
+        let mut guard = node.state.lock().unwrap();
+        loop {
+            if guard.status == ComputeStatus::Terminated {
+                break;
+            }
+
+            guard = node.state_changed.wait(guard).unwrap();
+            info!(
+                "waiting for compute to become {}, current status: {:?}",
+                ComputeStatus::Terminated,
+                guard.status
+            );
+        }
+    })
+    .await
+    .unwrap();
+
+    info!("terminated Postgres");
+
+    StatusCode::OK.into_response()
+}
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -0,0 +1,165 @@
+use std::{
+    net::{IpAddr, Ipv6Addr, SocketAddr},
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+    thread,
+    time::Duration,
+};
+
+use anyhow::Result;
+use axum::{
+    response::{IntoResponse, Response},
+    routing::{get, post},
+    Router,
+};
+use http::StatusCode;
+use tokio::net::TcpListener;
+use tower::ServiceBuilder;
+use tower_http::{
+    request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
+    trace::TraceLayer,
+};
+use tracing::{debug, error, info, Span};
+
+use super::routes::{
+    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
+    grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
+    terminate,
+};
+use crate::compute::ComputeNode;
+
+/// Fallback handler: answers requests for unknown routes with a bare 404.
+async fn handle_404() -> Response {
+    StatusCode::NOT_FOUND.into_response()
+}
+
+/// Request ID generator backed by a shared counter, so that every clone of
+/// the generator draws from the same sequence.
+#[derive(Clone, Default)]
+struct ComputeMakeRequestId(Arc<AtomicU64>);
+
+impl MakeRequestId for ComputeMakeRequestId {
+    /// Hand out the current counter value, in decimal, and advance the counter.
+    fn make_request_id<B>(&mut self, _request: &http::Request<B>) -> Option<RequestId> {
+        let current = self.0.fetch_add(1, Ordering::SeqCst);
+
+        // The decimal form of a `u64` is always a valid header value, so the
+        // `parse()` below cannot fail.
+        let header_value = current.to_string().parse().unwrap();
+
+        Some(RequestId::new(header_value))
+    }
+}
+
+/// Run the HTTP server and wait on it forever.
+#[tokio::main]
+async fn serve(port: u16, compute: Arc<ComputeNode>) {
+    const X_REQUEST_ID: &str = "x-request-id";
+
+    /// Read the request ID for logging. Clients may send their own header with
+    /// bytes `to_str()` rejects; that must not panic, so use a placeholder.
+    fn logged_request_id(headers: &http::HeaderMap) -> &str {
+        headers
+            .get(X_REQUEST_ID)
+            .and_then(|id| id.to_str().ok())
+            .unwrap_or("<invalid>")
+    }
+
+    let mut app: Router<Arc<ComputeNode>> = Router::new()
+        .route("/check_writability", post(check_writability::is_writable))
+        .route("/configure", post(configure::configure))
+        .route("/database_schema", get(database_schema::get_schema_dump))
+        .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
+        .route(
+            "/extension_server/*filename",
+            post(extension_server::download_extension),
+        )
+        .route("/extensions", post(extensions::install_extension))
+        .route("/grants", post(grants::add_grant))
+        .route("/info", get(info_route::get_info))
+        .route("/insights", get(insights::get_insights))
+        .route(
+            "/installed_extensions",
+            get(installed_extensions::get_installed_extensions),
+        )
+        .route("/metrics", get(metrics::get_metrics))
+        .route("/metrics.json", get(metrics_json::get_metrics))
+        .route("/status", get(status::get_status))
+        .route("/terminate", post(terminate::terminate));
+
+    // Add in any testing support. This has to happen before the layers are
+    // applied: `Router::layer` only wraps routes that were registered earlier.
+    if cfg!(feature = "testing") {
+        use super::routes::failpoints;
+
+        app = app.route("/failpoints", post(failpoints::configure_failpoints))
+    }
+
+    let app = app
+        .fallback(handle_404)
+        .layer(
+            ServiceBuilder::new()
+                .layer(SetRequestIdLayer::x_request_id(
+                    ComputeMakeRequestId::default(),
+                ))
+                .layer(
+                    TraceLayer::new_for_http()
+                        .on_request(|request: &http::Request<_>, _span: &Span| {
+                            let request_id = logged_request_id(request.headers());
+                            match request.uri().path() {
+                                "/metrics" => {
+                                    debug!(%request_id, "{} {}", request.method(), request.uri())
+                                }
+                                _ => info!(%request_id, "{} {}", request.method(), request.uri()),
+                            };
+                        })
+                        .on_response(
+                            |response: &http::Response<_>, latency: Duration, _span: &Span| {
+                                let request_id = logged_request_id(response.headers());
+                                info!(
+                                    %request_id,
+                                    code = response.status().as_u16(),
+                                    latency = latency.as_millis()
+                                )
+                            },
+                        ),
+                )
+                .layer(PropagateRequestIdLayer::x_request_id()),
+        )
+        .with_state(compute);
+
+    // This usually binds to both IPv4 and IPv6 on Linux, see
+    // https://github.com/rust-lang/rust/pull/34440 for more information
+    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
+    let listener = match TcpListener::bind(&addr).await {
+        Ok(listener) => listener,
+        Err(e) => {
+            error!(
+                "failed to bind the compute_ctl HTTP server to port {}: {}",
+                port, e
+            );
+            return;
+        }
+    };
+
+    if let Ok(local_addr) = listener.local_addr() {
+        info!("compute_ctl HTTP server listening on {}", local_addr);
+    } else {
+        info!("compute_ctl HTTP server listening on port {}", port);
+    }
+
+    if let Err(e) = axum::serve(listener, app).await {
+        error!("compute_ctl HTTP server error: {}", e);
+    }
+}
+
+/// Spawn the HTTP server on its own named thread and return the `JoinHandle`.
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let compute = Arc::clone(state);
+    let handle = thread::Builder::new()
+        .name("http-server".into())
+        .spawn(move || serve(port, compute))?;
+    Ok(handle)
+}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -3,8 +3,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

-extern crate hyper0 as hyper;
-
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -75,7 +75,7 @@ pub struct MutableApplyContext {
    pub dbs: HashMap<String, Database>,
 }

-/// Appply the operations that belong to the given spec apply phase.
+/// Apply the operations that belong to the given spec apply phase.
 ///
 /// Commands within a single phase are executed in order of Iterator yield.
 /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
@@ -498,7 +498,19 @@ async fn get_operations<'a>(
                                        ),
                                        comment: None,
                                    },
+                                    // Revoke some potentially blocking privileges (Neon-specific currently)
+                                    Operation {
+                                        query: format!(
+                                            include_str!("sql/pre_drop_role_revoke_privileges.sql"),
+                                            role_name = quoted,
+                                        ),
+                                        comment: None,
+                                    },
                                    // This now will only drop privileges of the role
+                                    // TODO: this is obviously not 100% true because of the above case,
+                                    // there could be still some privileges that are not revoked. Maybe this
+                                    // only drops privileges that were granted *by this* role, not *to this* role,
+                                    // but this has to be checked.
                                    Operation {
                                        query: format!("DROP OWNED BY {}", quoted),
                                        comment: None,
--- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
+++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
@@ -0,0 +1,28 @@
+SET SESSION ROLE neon_superuser;
+
+DO $$
+DECLARE
+    schema TEXT;
+    revoke_query TEXT;
+BEGIN
+    FOR schema IN
+        SELECT schema_name
+        FROM information_schema.schemata
+        -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
+        -- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
+        -- See https://github.com/neondatabase/cloud/issues/13582 for the context.
+        -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
+        -- ii) it's easy to add more schemas to the list if needed.
+        WHERE schema_name IN ('public')
+    LOOP
+        revoke_query := format(
+            'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
+            schema
+        );
+
+        EXECUTE revoke_query;
+    END LOOP;
+END;
+$$;
+
+RESET ROLE;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
 use crate::storage_controller::StorageController;

-use compute_api::responses::{ComputeState, ComputeStatus};
+use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
 use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
@@ -739,7 +739,7 @@ impl Endpoint {
    }

    // Call the /status HTTP API
-    pub async fn get_status(&self) -> Result<ComputeState> {
+    pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
        let client = reqwest::Client::new();

        let response = client
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -483,7 +483,6 @@ impl LocalEnv {
            .iter()
            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
            .map(|&(_, timeline_id)| timeline_id)
-            .map(TimelineId::from)
    }

    pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -822,10 +822,7 @@ impl StorageController {
        self.dispatch(
            Method::PUT,
            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-            Some(TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id,
-            }),
+            Some(TenantShardMigrateRequest { node_id }),
        )
        .await
    }
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,12 +1,16 @@
 use futures::StreamExt;
-use std::{str::FromStr, time::Duration};
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    time::Duration,
+};

 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
-        SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantDescribeResponse, TenantPolicyRequest,
+        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -112,6 +116,13 @@ enum Command {
        #[arg(long)]
        node: NodeId,
    },
+    /// Migrate the secondary location for a tenant shard to a specific pageserver.
+    TenantShardMigrateSecondary {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+        #[arg(long)]
+        node: NodeId,
+    },
    /// Cancel any ongoing reconciliation for this shard
    TenantShardCancelReconcile {
        #[arg(long)]
@@ -146,6 +157,12 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
+    TenantSetPreferredAz {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        preferred_az: Option<String>,
+    },
    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
    TenantDrop {
@@ -395,11 +412,12 @@ async fn main() -> anyhow::Result<()> {
            resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));

            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
+            table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
+                    node.availability_zone_id,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
@@ -459,33 +477,65 @@ async fn main() -> anyhow::Result<()> {
            println!("{table}");
        }
        Command::Tenants { node_id: None } => {
-            let mut resp = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-
-            resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
-
+            // Set up output formatting
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
+                "Preferred AZ",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
-            for tenant in resp {
-                let shard_zero = tenant.shards.into_iter().next().unwrap();
-                table.add_row([
-                    format!("{}", tenant.tenant_id),
-                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
-                    format!("{:?}", tenant.stripe_size),
-                    format!("{:?}", tenant.policy),
-                    format!("{:?}", shard_zero.scheduling_policy),
-                ]);
+
+            // Pagination loop over listing API
+            let mut start_after = None;
+            const LIMIT: usize = 1000;
+            loop {
+                let path = match start_after {
+                    None => format!("control/v1/tenant?limit={LIMIT}"),
+                    Some(start_after) => {
+                        format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}")
+                    }
+                };
+
+                let resp = storcon_client
+                    .dispatch::<(), Vec<TenantDescribeResponse>>(Method::GET, path, None)
+                    .await?;
+
+                if resp.is_empty() {
+                    // End of data reached
+                    break;
+                }
+
+                // Give some visual feedback while we're building up the table (comfy_table doesn't have
+                // streaming output)
+                if resp.len() >= LIMIT {
+                    eprint!(".");
+                }
+
+                start_after = Some(resp.last().unwrap().tenant_id);
+
+                for tenant in resp {
+                    // Per-tenant columns (preferred AZ, shard count, scheduling policy) are read
+                    // from the first shard in the response
+                    let shard_zero = tenant.shards.into_iter().next().unwrap();
+
+                    table.add_row([
+                        format!("{}", tenant.tenant_id),
+                        // An unset preferred AZ is rendered as an empty cell
+                        shard_zero.preferred_az_id.clone().unwrap_or_default(),
+                        format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
+                        format!("{:?}", tenant.stripe_size),
+                        format!("{:?}", tenant.policy),
+                        format!("{:?}", shard_zero.scheduling_policy),
+                    ]);
+                }
+            }
+
+            // End the line of progress dots; one dot is printed for every full page of results
+            if table.row_count() >= LIMIT {
+                eprintln!();
            }

            println!("{table}");
@@ -540,10 +590,7 @@ async fn main() -> anyhow::Result<()> {
            tenant_shard_id,
            node,
        } => {
-            let req = TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id: node,
-            };
+            let req = TenantShardMigrateRequest { node_id: node };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -553,6 +600,20 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
+        Command::TenantShardMigrateSecondary {
+            tenant_shard_id,
+            node,
+        } => {
+            let req = TenantShardMigrateRequest { node_id: node };
+
+            storcon_client
+                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
+                    Some(req),
+                )
+                .await?;
+        }
        Command::TenantShardCancelReconcile { tenant_shard_id } => {
            storcon_client
                .dispatch::<(), ()>(
@@ -596,6 +657,19 @@ async fn main() -> anyhow::Result<()> {
                    None,
                )
                .await?;
+
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+            let nodes = nodes
+                .into_iter()
+                .map(|n| (n.id, n))
+                .collect::<HashMap<_, _>>();
+
            println!("Tenant {tenant_id}");
            let mut table = comfy_table::Table::new();
            table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -604,7 +678,14 @@ async fn main() -> anyhow::Result<()> {
            println!("{table}");
            println!("Shards:");
            let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
+            table.set_header([
+                "Shard",
+                "Attached",
+                "Attached AZ",
+                "Secondary",
+                "Last error",
+                "status",
+            ]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
@@ -627,11 +708,18 @@ async fn main() -> anyhow::Result<()> {
                }
                let status = status_parts.join(",");

+                let attached_node = shard
+                    .node_attached
+                    .as_ref()
+                    .map(|id| nodes.get(id).expect("Shard references nonexistent node"));
+
                table.add_row([
                    format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
+                    attached_node
+                        .map(|n| format!("{} ({})", n.listen_http_addr, n.id))
+                        .unwrap_or(String::new()),
+                    attached_node
+                        .map(|n| n.availability_zone_id.clone())
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
@@ -640,6 +728,66 @@ async fn main() -> anyhow::Result<()> {
            }
            println!("{table}");
        }
+        Command::TenantSetPreferredAz {
+            tenant_id,
+            preferred_az,
+        } => {
+            // First learn about the tenant's shards
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            // Learn about nodes to validate the AZ ID
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            if let Some(preferred_az) = &preferred_az {
+                let azs = nodes
+                    .into_iter()
+                    .map(|n| (n.availability_zone_id))
+                    .collect::<HashSet<_>>();
+                if !azs.contains(preferred_az) {
+                    anyhow::bail!(
+                        "AZ {} not found on any node: known AZs are: {:?}",
+                        preferred_az,
+                        azs
+                    );
+                }
+            } else {
+                // Make it obvious to the user that since they've omitted an AZ, we're clearing it
+                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
+            }
+
+            // Construct a request that modifies all the tenant's shards
+            let req = ShardsPreferredAzsRequest {
+                preferred_az_ids: describe_response
+                    .shards
+                    .into_iter()
+                    .map(|s| {
+                        (
+                            s.tenant_shard_id,
+                            preferred_az.clone().map(AvailabilityZone),
+                        )
+                    })
+                    .collect(),
+            };
+            storcon_client
+                .dispatch::<ShardsPreferredAzsRequest, ()>(
+                    Method::PUT,
+                    "control/v1/preferred_azs".to_string(),
+                    Some(req),
+                )
+                .await?;
+        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
@@ -915,10 +1063,7 @@ async fn main() -> anyhow::Result<()> {
                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                                Method::PUT,
                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest {
-                                    tenant_shard_id: mv.tenant_shard_id,
-                                    node_id: mv.to,
-                                }),
+                                Some(TenantShardMigrateRequest { node_id: mv.to }),
                            )
                            .await
                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1035,7 +1180,15 @@ async fn main() -> anyhow::Result<()> {
            resp.sort_by(|a, b| a.id.cmp(&b.id));

            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
+            table.set_header([
+                "Id",
+                "Version",
+                "Host",
+                "Port",
+                "Http Port",
+                "AZ Id",
+                "Scheduling",
+            ]);
            for sk in resp {
                table.add_row([
                    format!("{}", sk.id),
@@ -1043,7 +1196,8 @@ async fn main() -> anyhow::Result<()> {
                    sk.host,
                    format!("{}", sk.port),
                    format!("{}", sk.http_port),
-                    sk.availability_zone_id.to_string(),
+                    sk.availability_zone_id.clone(),
+                    String::from(sk.scheduling_policy),
                ]);
            }
            println!("{table}");
--- a/debug-oom/.gitignore
+++ b/debug-oom/.gitignore
@@ -1 +0,0 @@
-backup.tar.gz
--- a/debug-oom/README.md
+++ b/debug-oom/README.md
@@ -1,21 +0,0 @@
-To build a compute image:
-```
-docker build --build-arg GIT_VERSION=custombuild --build-arg PG_VERSION=v16 -t neon-local-v16 -f ../compute/compute-node.Dockerfile .. && \
-../../autoscaling/bin/vm-builder \
-            -spec=../compute/vm-image-spec-bullseye.yaml \
-            -src=neon-local-v16:latest \
-            -dst=vm-neon-local-v16:latest \
-            -target-arch=linux/amd64 \
-            -size 2G && \
-../../autoscaling/bin/kind load docker-image vm-neon-local-v16:latest --name neonvm-arthur
-```
-
-To start a compute node:
-```
-kubectl apply -f ./spec.yml
-```
-
-How to destroy:
-```
-kubectl delete -f ./spec.yml
-```
--- a/debug-oom/spec.yml
+++ b/debug-oom/spec.yml
@@ -1,99 +0,0 @@
-apiVersion: vm.neon.tech/v1
-kind: VirtualMachine
-metadata:
-  annotations:
-    autoscaling.neon.tech/bounds: '{"min":{"cpu":"250m","mem":"1Gi"},"max":{"cpu":"2","mem":"8Gi"}}'
-    autoscaling.neon.tech/config: '{"enableLFCMetrics":true}'
-  creationTimestamp: "2025-01-04T18:37:29Z"
-  finalizers:
-  - vm.neon.tech/finalizer
-  generation: 1
-  labels:
-    autoscaling.neon.tech/enabled: "true"
-    neon/component: compute-node
-    neon/compute-id: compute-purple-art-unreal
-    neon/endpoint-id: ep-unreal
-  name: compute-purple-art-unreal
-  namespace: default
-spec:
-  cpuScalingMode: QmpScaling
-  disks:
-  - emptyDisk:
-      discard: true
-      size: 36096Mi
-    mountPath: /neonvm/cache
-    name: cache
-    readOnly: false
-  - emptyDisk:
-      discard: true
-      enableQuotas: true
-      size: 150Gi
-    mountPath: /var/db/postgres/compute
-    name: pgdata
-    readOnly: false
-  enableAcceleration: true
-  enableNetworkMonitoring: false
-  enableSSH: true
-  guest:
-    args:
-    - -c
-    - /usr/local/bin/compute_ctl -D /var/db/postgres/compute/pgdata -b /usr/local/bin/postgres
-      -C postgresql://cloud_admin@127.0.0.1/postgres?options=-c%20default_transaction_read_only%3Dfalse
-      --compute-id compute-purple-art-unreal --control-plane-uri http://dontexist.local:9096
-      --resize-swap-on-bind --set-disk-quota-for-fs /var/db/postgres/compute 2>&1
-    command:
-    - /bin/sh
-    cpus:
-      max: 10
-      min: 250m
-      use: 500m
-    env:
-    - name: RUST_LOG
-      value: info
-    - name: OTEL_SDK_DISABLED
-      value: "true"
-    - name: AUTOSCALING
-      value: "true"
-    memorySlotSize: 1Gi
-    memorySlots:
-      max: 40
-      min: 1
-      use: 2
-    ports:
-    - name: postgres
-      port: 5432
-      protocol: TCP
-    - name: control
-      port: 3080
-      protocol: TCP
-    - name: pooler
-      port: 6432
-      protocol: TCP
-    - name: host-metrics
-      port: 9100
-      protocol: TCP
-    - name: metrics
-      port: 9187
-      protocol: TCP
-    - name: sql-exporter
-      port: 9399
-      protocol: TCP
-    - name: sql-exporter-2
-      port: 9499
-      protocol: TCP
-    - name: vm-monitor
-      port: 10301
-      protocol: TCP
-    - name: local-proxy
-      port: 10432
-      protocol: TCP
-    rootDisk:
-      image: vm-neon-local-v16
-      imagePullPolicy: IfNotPresent
-      size: 20Gi
-    settings:
-      swap: 40Gi
-      sysctl:
-      - vm.overcommit_memory=2
-  restartPolicy: Always
-  schedulerName: autoscale-scheduler
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,15 +7,11 @@ Currently we build two main images:
 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).

-And additional intermediate image:
-
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
-
 ## Build pipeline

 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
+1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14)

 2. `neondatabase/neon`

--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it
 refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
 response it sends its current configuration generation to let walproposer know.

-Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` 
+Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
 accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
 current one and ignores it otherwise. In any case it replies with
 ```
@@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not
 define consensus members. Instead, on start walproposer tracks highest
 configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
 from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
-establishes this configuration as its own and moves to voting. 
+establishes this configuration as its own and moves to voting.

 It should stop talking to safekeepers not listed in the configuration at this
 point, though it is not unsafe to continue doing so.
@@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
 The following algorithm can be executed anywhere having access to configuration
 storage and safekeepers. It is safe to interrupt / restart it and run multiple
 instances of it concurrently, though likely one of them won't make
-progress then. It accepts `desired_set: Vec<NodeId>` as input. 
+progress then. It accepts `desired_set: Vec<NodeId>` as input.

 Algorithm will refuse to make the change if it encounters previous interrupted
 change attempt, but in this case it will try to finish it.
@@ -140,7 +140,7 @@ storage are reachable.
   safe. Failed CAS aborts the procedure.
 4) Call `PUT` `configuration` on safekeepers from the current set,
   delivering them `joint_conf`. Collecting responses from majority is required
-   to proceed. If any response returned generation higher than 
+   to proceed. If any response returned generation higher than
   `joint_conf.generation`, abort (another switch raced us). Otherwise, choose
   max `<last_log_term, flush_lsn>` among responses and establish it as
   (in memory) `sync_position`. Also choose max `term` and establish it as (in
@@ -149,49 +149,49 @@ storage are reachable.
   without ack from the new set. Similarly, we'll bump term on new majority
   to `sync_term` so that two computes with the same term are never elected.
 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
-   doesn't exist yet by doing `pull_timeline` from the majority of the 
+   doesn't exist yet by doing `pull_timeline` from the majority of the
   current set. Doing that on majority of `new_sk_set` is enough to
   proceed, but it is reasonable to ensure that all `new_sk_set` members
   are initialized -- if some of them are down why are we migrating there?
-5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. 
+5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
   Success on majority is enough.
 6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
   delivering them `joint_conf` and collecting their positions. This will
-   switch them to the `joint_conf` which generally won't be needed 
+   switch them to the `joint_conf` which generally won't be needed
   because `pull_timeline` already includes it and plus additionally would be
   broadcast by compute. More importantly, we may proceed to the next step
-   only when `<last_log_term, flush_lsn>` on the majority of the new set reached 
-   `sync_position`. Similarly, on the happy path no waiting is not needed because 
+   only when `<last_log_term, flush_lsn>` on the majority of the new set reached
+   `sync_position`. Similarly, on the happy path no waiting is needed because
   `pull_timeline` already includes it. However, we should double
    check to be safe. For example, timeline could have been created earlier e.g.
-    manually or after try-to-migrate, abort, try-to-migrate-again sequence. 
-7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 
-   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration 
+    manually or after try-to-migrate, abort, try-to-migrate-again sequence.
+7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
+   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
   storage under one more CAS.
 8) Call `PUT` `configuration` on safekeepers from the new set,
-   delivering them `new_conf`. It is enough to deliver it to the majority 
+   delivering them `new_conf`. It is enough to deliver it to the majority
   of the new set; the rest can be updated by compute.

 I haven't put huge effort to make the description above very precise, because it
 is natural language prone to interpretations anyway. Instead I'd like to make TLA+
 spec of it.

-Description above focuses on safety. To make the flow practical and live, here a few more 
+Description above focuses on safety. To make the flow practical and live, here are a few more
 considerations.
-1) It makes sense to ping new set to ensure it we are migrating to live node(s) before 
+1) It makes sense to ping new set to ensure we are migrating to live node(s) before
  step 3.
-2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed 
+2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
   it is safe to rollback to the old conf with one more CAS.
-3) On step 4 timeline might be already created on members of the new set for various reasons; 
+3) On step 4 timeline might be already created on members of the new set for various reasons;
   the simplest is the procedure restart. There are more complicated scenarious like mentioned
-   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving 
-   generations, so seems simpler to treat existing timeline as success. However, this also 
+   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
+   generations, so seems simpler to treat existing timeline as success. However, this also
   has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
   the step 5 is never reached until compute is (re)awaken up to synchronize new member(s).
   I don't think we'll observe this in practice, but can add waking up compute if needed.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
   in the old set but not in the new one, unless they are unreachable. To be
-   safe this also should be done under generation number (deletion proceeds only if 
+   safe this also should be done under generation number (deletion proceeds only if
   current configuration is <= than one in request and safekeeper is not memeber of it).
 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
   jump to step 7, using it as `new_conf`.
@@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control
 plane and storage_controller; and as each of them already has db we don't want
 yet another storage. I propose to manage safekeepers in storage_controller
 because 1) since it is in rust it simplifies simulation testing (more on this
-below) 2) it already manages pageservers. 
+below) 2) it already manages pageservers.

 This assumes that migration will be fully usable only after we migrate all
 tenants/timelines to storage_controller. It is discussible whether we want also
 to manage pageserver attachments for all of these, but likely we do.

-This requires us to define storcon <-> cplane interface.
+This requires us to define storcon <-> cplane interface and changes.

-### storage_controller <-> control plane interface
+### storage_controller <-> control plane interface and changes

 First of all, control plane should
 [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
 storing safekeepers per timeline instead of per tenant because we can't migrate
-tenants atomically. 
+tenants atomically.

 The important question is how updated configuration is delivered from
 storage_controller to control plane to provide it to computes. As always, there
 are two options, pull and push. Let's do it the same push as with pageserver
 `/notify-attach` because 1) it keeps storage_controller out of critical compute
-start path 2) provides easier upgrade: there won't be such a thing as 'timeline
-managed by control plane / storcon', cplane just takes the value out of its db
-when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
-control plane until it succeeds.
+start path 2) uniformity. It makes storage_controller responsible for retrying
+notifying control plane until it succeeds.

-So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
-updates it in the db if the provided conf generation is higher (the cplane db
-should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
-should update db which makes the call successful, and then try to schedule
-`apply_config` if possible, it is ok if not. storage_controller 
-should rate limit calling the endpoint, but likely this won't be needed, as migration
+It is not needed for the control plane to fully know the `Configuration`. It is
+enough for it to only be aware of the list of safekeepers in the latest
+configuration to supply it to compute, plus associated generation number to
+protect from stale update requests and to also pass it to compute.
+
+So, cplane `/notify-safekeepers` for the timeline can accept JSON like
+```
+{
+   tenant_id: String,
+   timeline_id: String,
+   generation: u32,
+   safekeepers: Vec<SafekeeperId>,
+}
+```
+where `SafekeeperId` is
+```
+{
+   node_id: u64,
+   host: String
+}
+```
+In principle `host` is redundant, but may be useful for observability.
+
+The request updates list of safekeepers in the db if the provided conf
+generation is higher (the cplane db should also store generations for this).
+Similarly to
+[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365),
+it should update db which makes the call successful, and then try to schedule
+`apply_config` if possible, it is ok if not. storage_controller should rate
+limit calling the endpoint, but likely this won't be needed, as migration
 throughput is limited by `pull_timeline`.

 Timeline (branch) creation in cplane should call storage_controller POST
 `tenant/:tenant_id/timeline` like it currently does for sharded tenants.
-Response should be augmented with `safekeeper_conf: Configuration`. The call
-should be retried until succeeds.
+Response should be augmented with `safekeepers_generation` and `safekeepers`
+fields like described in `/notify-safekeepers` above. Initially (currently)
+these fields may be absent; in this case cplane chooses safekeepers on its own
+like it currently does. The call should be retried until succeeds.

 Timeline deletion and tenant deletion in cplane should call appropriate
 storage_controller endpoints like it currently does for sharded tenants. The
 calls should be retried until they succeed.

+When compute receives safekeepers list from control plane it needs to know the
+generation to check whether it should be updated (note that compute may get
+safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
+GUC is just a comma-separated list of `host:port`. Let's prefix it with
+`g#<generation>:` to this end, so it will look like
+```
+g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
+```
+
+To summarize, list of cplane changes:
+- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field.
+- `/notify-safekeepers` endpoint.
+- Branch creation call may return list of safekeepers and when it is
+  present cplane should adopt it instead of choosing on its own like it does currently.
+- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
+
 ### storage_controller implementation

 Current 'load everything on startup and keep in memory' easy design is fine.
@@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to
 decomission the node but leaves garbage otherwise. I'd propose in the first version
 1) Don't attempt deletion at all if node status is `offline`.
 2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and 
-remove garbage timelines for manual use. It will 1) list all timelines on the 
-safekeeper 2) compare each one against configuration storage: if timeline 
-doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can 
+And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
+remove garbage timelines for manual use. It will 1) list all timelines on the
+safekeeper 2) compare each one against configuration storage: if timeline
+doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
 be deleted under generation number if node is not member of current generation.

 Automating this is untrivial; we'd need to register all potential missing
@@ -412,8 +452,8 @@ There should be following layers of tests:
 3) Since simulation testing injects at relatively high level points (not
   syscalls), it omits some code, in particular `pull_timeline`. Thus it is
   better to have basic tests covering whole system as well. Extended version of
-   `test_restarts_under_load` would do: start background load and do migration 
-   under it, then restart endpoint and check that no reported commits 
+   `test_restarts_under_load` would do: start background load and do migration
+   under it, then restart endpoint and check that no reported commits
   had been lost. I'd also add one more creating classic network split scenario, with
   one compute talking to AC and another to BD while migration from nodes ABC to ABD
   happens.
@@ -422,35 +462,51 @@ There should be following layers of tests:

 ## Order of implementation and rollout

-Note that 
+Note that
 - Control plane parts and integration with it is fully independent from everything else
  (tests would use simulation and neon_local).
+- It is reasonable to make compute <-> safekeepers protocol change
+  independent of enabling generations.
 - There is a lot of infra work making storage_controller aware of timelines and safekeepers
  and its impl/rollout should be separate from migration itself.
- Initially walproposer can just stop working while it observers joint configuration.
+- Initially walproposer can just stop working while it observes joint configuration.
  Such window would be typically very short anyway.
+- Obviously we want to test the whole thing thoroughly on staging and only then
+  gradually enable in prod.

-To rollout smoothly, both walproposer and safekeeper should have flag
-`configurations_enabled`; when set to false, they would work as currently, i.e.
-walproposer is able to commit on whatever safekeeper set it is provided. Until
-all timelines are managed by storcon we'd need to use current script to migrate
-and update/drop entries in the storage_controller database if it has any.
+Let's have the following implementation bits for gradual rollout:
+- compute gets `neon.safekeepers_proto_version` flag.
+  Initially both compute and safekeepers will be able to talk both
+  versions so that we can delay force restart of them and for
+  simplicity of rollback in case it is needed.
+- storcon gets `--set-safekeepers` config option disabled by
+  default. Timeline creation request chooses safekeepers
+  (and returns them in response to cplane) only when it is set to
+  true.
+- control_plane [see above](#storage_controller---control-plane-interface-and-changes)
+  prefixes `neon.safekeepers` GUC with generation number. When it is 0
+  (or prefix not present at all), walproposer behaves as currently, committing on
+  the provided safekeeper list -- generations are disabled.
+  If it is non-zero, it follows the rules of this RFC.
+- We provide a script for manual migration to storage controller.
+  It selects timeline(s) from control plane (specified or all of them) db
+  and calls special import endpoint on storage controller which is very
+  similar to timeline creation: it inserts into the db, sets
+  configuration to initial on the safekeepers, calls cplane
+  `notify-safekeepers`.

-Safekeepers would need to be able to talk both current and new protocol version
-with compute to reduce number of computes restarted in prod once v2 protocol is
-deployed (though before completely switching we'd need to force this).
-
-Let's have the following rollout order:
- storage_controller becomes aware of safekeepers;
- storage_controller gets timeline creation for new timelines and deletion requests, but
-  doesn't manage all timelines yet. Migration can be tested on these new timelines.
-  To keep control plane and storage_controller databases in sync while control 
-  plane still chooses the safekeepers initially (until all timelines are imported
-  it can choose better), `TimelineCreateRequest` can get optional safekeepers
-  field with safekeepers chosen by cplane.
- Then we can import all existing timelines from control plane to
-  storage_controller and gradually enable configurations region by region.
+Then the rollout for a region would be:
+- Current situation: safekeepers are chosen by control_plane.
+- We manually migrate some timelines, test moving them around.
+- Then we enable `--set-safekeepers` so that all new timelines
+  are on storage controller.
+- Finally migrate all existing timelines using the script (no
+  compute should be speaking old proto version at this point).

+Until all timelines are managed by storcon we'd need to use current ad hoc
+script to migrate if needed. To keep state clean, all storage controller managed
+timelines must be migrated before that, or controller db and configurations
+state of safekeepers dropped manually.

 Very rough implementation order:
 - Add concept of configurations to safekeepers (including control file),
@@ -458,10 +514,10 @@ Very rough implementation order:
 - Implement walproposer changes, including protocol.
 - Implement storconn part. Use it in neon_local (and pytest).
 - Make cplane store safekeepers per timeline instead of per tenant.
- Implement cplane/storcon integration. Route branch creation/deletion 
+- Implement cplane/storcon integration. Route branch creation/deletion
  through storcon. Then we can test migration of new branches.
- Finally import existing branches. Then we can drop cplane 
-  safekeeper selection code. Gradually enable configurations at 
+- Finally import existing branches. Then we can drop cplane
+  safekeeper selection code. Gradually enable configurations at
  computes and safekeepers. Before that, all computes must talk only
  v3 protocol version.

--- a/docs/rfcs/040-profiling.md
+++ b/docs/rfcs/040-profiling.md
@@ -0,0 +1,247 @@
+# CPU and Memory Profiling
+
+Created 2025-01-12 by Erik Grinaker.
+
+See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4).
+
+## Summary
+
+This document proposes a standard cross-team pattern for CPU and memory profiling across
+applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
+
+It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via
+[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
+Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations.
+
+## Motivation
+
+CPU and memory profiles are crucial observability tools for understanding performance issues,
+resource exhaustion, and resource costs. They allow answering questions like:
+
+* Why is this process using 100% CPU?
+* How do I make this go faster?
+* Why did this process run out of memory?
+* Why are we paying for all these CPU cores and memory chips?
+
+Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its
+standard library, using the [pprof](https://github.com/google/pprof) profile format and associated
+tooling.
+
+This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires
+installing and running additional tools like `perf` as root on production nodes, with analysis tools
+that can be hard to use and often don't give good results. This is not only annoying, but can also
+significantly affect the resolution time of production incidents.
+
+This proposal will:
+
+* Provide CPU and heap profiles in pprof format via HTTP API.
+* Record continuous profiles in Grafana for aggregate historical analysis.
+* Make it easy for anyone to see a flamegraph in less than one minute.
+* Be reasonably consistent across teams and services (Rust, Go, C).
+
+## Non Goals (For Now)
+
+* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
+  like mutexes, locks, goroutines, etc.
+* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
+* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
+
+## Using Profiles
+
+Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
+
+```
+$ curl localhost:9898/profile/cpu >profile.pb.gz
+```
+
+pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
+provides flamegraphs, call graphs, plain text listings, and more:
+
+```
+$ pprof -http :6060 <profile>
+```
+
+Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
+
+```
+$ curl localhost:9898/profile/cpu?format=svg >profile.svg
+$ open profile.svg
+```
+
+Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
+(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
+
+## API Requirements
+
+* HTTP endpoints that return a profile in pprof format (with symbols).
+  * CPU: records a profile over the request time interval (`seconds` query parameter).
+  * Memory: returns the current in-use heap allocations.
+* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
+* Default sample frequency should not impact service (maximum 5% CPU overhead).
+* Linux-compatibility.
+
+Nice to have:
+
+* Return flamegraph SVG directly from the HTTP endpoint if requested.
+* Configurable sample frequency for CPU profiles.
+* Historical heap allocations, by count and bytes.
+* macOS-compatibility.
+
+## Rust Profiling
+
+[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
+contains ready-to-use HTTP endpoints for CPU and memory profiling:
+[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
+
+### CPU
+
+CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
+[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
+Expose it unauthenticated at `/profile/cpu`.
+
+Parameters:
+
+* `format`: profile output format (`pprof` or `svg`; default `pprof`).
+* `seconds`: duration to collect profile over, in seconds (default `5`).
+* `frequency`: how often to sample thread stacks, in Hz (default `99`).
+* `force`: if `true`, cancel a running profile and start a new one (default `false`).
+
+Works on Linux and macOS.
+
+### Memory
+
+Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator),
+and enable profiling with samples every 2 MB allocated:
+
+```rust
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
+```
+
+pprof profiles are generated by
+[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via
+[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
+Expose it unauthenticated at `/profile/heap`.
+
+Parameters:
+
+* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`).
+
+Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26).
+
+## Go Profiling
+
+The Go standard library includes pprof profiling via HTTP API in
+[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at
+`/debug/pprof`.
+
+Works on Linux and macOS.
+
+### CPU
+
+Via `/debug/pprof/profile`. Parameters:
+
+* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`).
+* `seconds`: duration to collect profile over, in seconds (default `30`).
+
+Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)),
+and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default
+is likely ok (estimated 1% overhead).
+
+### Memory
+
+Via `/debug/pprof/heap`. Parameters:
+
+* `seconds`: take a delta profile over the given duration, in seconds (default `0`).
+* `gc`: if `1`, garbage collect before taking profile.
+
+## C Profiling
+
+[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling
+with pprof output.
+
+However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value
+since we don't own the internals anyway.
+
+Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient,
+so this is not a priority at the moment.
+
+## Grafana Continuous Profiling
+
+[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles
+across the fleet, and archives them as time series. This can be used to analyze resource usage over
+time, either in aggregate or zoomed in to specific events and nodes.
+
+Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals
+is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB).
+
+It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)
+for Pageserver and Safekeeper.
+
+### Scraping
+
+* CPU profiling: 59 seconds at 19 Hz every 60 seconds.
+* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds.
+
+There are two main approaches that can be taken for CPU profiles:
+
+* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds).
+* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds).
+
+We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead
+of a spiky high overhead. It likely also gives a more representative view of resource usage.
+However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
+actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
+use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
+
+Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
+To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
+cancel a running profile and start a new one.
+
+### Overhead
+
+With Rust:
+
+* CPU profiles at 19 Hz frequency: 0.1% overhead.
+* Heap profiles at 2 MB frequency: 3% allocation overhead.
+* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
+* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
+
+Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
+11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
+frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
+overhead).
+
+CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
+after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
+of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
+trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
+likely 0.1% in practice (given e.g. context switches).
+
+Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
+allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
+so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
+consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
+fact that performance-sensitive code will avoid allocations as far as possible.
+
+Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
+Pageserver.
+
+## Alternatives Considered
+
+* eBPF profiles.
+  * Don't require instrumenting the binary.
+  * Use less resources.
+  * Can profile in kernel space too.
+  * Supported by Grafana.
+  * Less information about stack frames and spans.
+  * Limited tooling for local analysis.
+  * Does not support heap profiles.
+  * Does not work on macOS.
+
+* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
+  * We already use Grafana for everything else. Appears good enough.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -15,6 +15,17 @@ pub struct GenericAPIError {
    pub error: String,
 }

+#[derive(Debug, Clone, Serialize)]
+pub struct InfoResponse {
+    pub num_cpus: usize,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct ExtensionInstallResponse {
+    pub extension: PgIdent,
+    pub version: ExtVersion,
+}
+
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
@@ -28,16 +39,6 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

-#[derive(Deserialize, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub struct ComputeState {
-    pub status: ComputeStatus,
-    /// Timestamp of the last Postgres activity
-    #[serde(serialize_with = "rfc3339_serialize")]
-    pub last_active: Option<DateTime<Utc>>,
-    pub error: Option<String>,
-}
-
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -78,7 +79,7 @@ impl Display for ComputeStatus {
    }
 }

-fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
+pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
 where
    S: Serializer,
 {
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
 #[derive(Serialize, Deserialize)]
 pub struct ShardsPreferredAzsRequest {
    #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
+    pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
 }

 #[derive(Serialize, Deserialize)]
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
    pub availability: NodeAvailabilityWrapper,
    pub scheduling: NodeSchedulingPolicy,

+    pub availability_zone_id: String,
+
    pub listen_http_addr: String,
    pub listen_http_port: u16,

@@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard {
 /// specifies some constraints, e.g. asking it to get off particular node(s)
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
    pub node_id: NodeId,
 }

@@ -320,6 +321,38 @@ impl From<NodeSchedulingPolicy> for String {
    }
 }

+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum SkSchedulingPolicy {
+    Active,
+    Disabled,
+    Decomissioned,
+}
+
+impl FromStr for SkSchedulingPolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s {
+            "active" => Self::Active,
+            "disabled" => Self::Disabled,
+            "decomissioned" => Self::Decomissioned,
+            _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+        })
+    }
+}
+
+impl From<SkSchedulingPolicy> for String {
+    fn from(value: SkSchedulingPolicy) -> String {
+        use SkSchedulingPolicy::*;
+        match value {
+            Active => "active",
+            Disabled => "disabled",
+            Decomissioned => "decomissioned",
+        }
+        .to_string()
+    }
+}
+
 /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
@@ -336,6 +369,16 @@ pub enum PlacementPolicy {
    Detached,
 }

+impl PlacementPolicy {
+    pub fn want_secondaries(&self) -> usize {
+        match self {
+            PlacementPolicy::Attached(secondary_count) => *secondary_count,
+            PlacementPolicy::Secondary => 1,
+            PlacementPolicy::Detached => 0,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

@@ -387,6 +430,7 @@ pub struct SafekeeperDescribeResponse {
    pub port: i32,
    pub http_port: i32,
    pub availability_zone_id: String,
+    pub scheduling_policy: SkSchedulingPolicy,
 }

 #[cfg(test)]
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
 /// Non inherited range for vectored get.
 pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
-pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
+pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();

 impl Key {
    // AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,7 +714,42 @@ impl Key {
    // switch (and generally it likely should be optional), so ignore these.
    #[inline(always)]
    pub fn is_inherited_key(self) -> bool {
-        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
+        if self.is_sparse() {
+            self.is_inherited_sparse_key()
+        } else {
+            !NON_INHERITED_RANGE.contains(&self)
+        }
+    }
+
+    #[inline(always)]
+    pub fn is_sparse(self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
+    }
+
+    /// Check if the key belongs to the inherited keyspace.
+    fn is_inherited_sparse_key(self) -> bool {
+        debug_assert!(self.is_sparse());
+        self.field1 == RELATION_SIZE_PREFIX
+    }
+
+    pub fn sparse_non_inherited_keyspace() -> Range<Key> {
+        // The two keys are adjacent; if we have non-adjacent keys in the future, we should return a keyspace
+        debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
+        Key {
+            field1: AUX_KEY_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..Key {
+            field1: REPL_ORIGIN_KEY_PREFIX + 1,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }
    }

    #[inline(always)]
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -272,6 +272,8 @@ pub struct CompactInfoResponse {
    pub compact_key_range: Option<CompactKeyRange>,
    pub compact_lsn_range: Option<CompactLsnRange>,
    pub sub_compaction: bool,
+    pub running: bool,
+    pub job_id: usize,
 }

 #[derive(Serialize, Deserialize, Clone)]
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -31,6 +31,8 @@
 //! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
 //!   and their slugs are 0004, 0104, 0204, and 0304.

+use std::hash::{Hash, Hasher};
+
 use crate::{key::Key, models::ShardParameters};
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
@@ -48,6 +50,23 @@ pub struct ShardIdentity {
    layout: ShardLayout,
 }

+/// Hash implementation
+///
+/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
+impl Hash for ShardIdentity {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ShardIdentity {
+            number,
+            count,
+            stripe_size: _,
+            layout: _,
+        } = self;
+
+        number.0.hash(state);
+        count.0.hash(state);
+    }
+}
+
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
 }

 /// Layout version: for future upgrades where we might change how the key->shard mapping works
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
 pub struct ShardLayout(u8);

 const LAYOUT_V1: ShardLayout = ShardLayout(1);
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlMultiXactCreate {
    pub mid: MultiXactId,
    /* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
 }

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlMultiXactTruncate {
    pub oldest_multi_db: Oid,
    /* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
 }

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlRelmapUpdate {
    pub dbid: Oid,   /* database ID, or 0 for shared map */
    pub tsid: Oid,   /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
 }

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlReploriginDrop {
    pub node_id: RepOriginId,
 }
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
 }

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlReploriginSet {
    pub remote_lsn: Lsn,
    pub node_id: RepOriginId,
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
 }

 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlSmgrTruncate {
    pub blkno: BlockNumber,
    pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
 /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
 /// struct for commits and aborts.
 ///
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlXactParsedRecord {
    pub xid: TransactionId,
    pub info: u8,
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);

 impl ProtocolVersion {
    pub const fn new(major: u16, minor: u16) -> Self {
-        Self((major as u32) << 16 | minor as u32)
+        Self(((major as u32) << 16) | minor as u32)
    }
    pub const fn minor(self) -> u16 {
        self.0 as u16
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -43,6 +43,17 @@ impl RemoteStorageKind {
    }
 }

+impl RemoteStorageConfig {
+    /// Returns the concurrency limit configured for S3 or Azure storage; `None` for `LocalFs`.
+    pub fn concurrency_limit(&self) -> Option<usize> {
+        match &self.storage {
+            RemoteStorageKind::LocalFs { .. } => None,
+            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
+            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
+        }
+    }
+}
+
 fn default_timeout() -> Duration {
    RemoteStorageConfig::DEFAULT_TIMEOUT
 }
@@ -115,13 +126,15 @@ fn default_max_keys_per_list_response() -> Option<i32> {
 }

 fn default_azure_conn_pool_size() -> usize {
-    // Conservative default: no connection pooling.  At time of writing this is the Azure
-    // SDK's default as well, due to historic reports of hard-to-reproduce issues
+    // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
    // (https://github.com/hyperium/hyper/issues/2312)
    //
    // However, using connection pooling is important to avoid exhausting client ports when
    // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
-    0
+    //
+    // We therefore enable a modest pool size by default: this may be configured to zero if
+    // issues like the alleged upstream hyper issue appear.
+    8
 }

 impl Debug for S3Config {
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -5,8 +5,10 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+anyhow.workspace = true
 const_format.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -4,12 +4,15 @@ use const_format::formatcp;
 use pq_proto::SystemId;
 use serde::{Deserialize, Serialize};

+pub mod membership;
 /// Public API types
 pub mod models;

 /// Consensus logical timestamp. Note: it is a part of sk control file.
 pub type Term = u64;
-pub const INVALID_TERM: Term = 0;
+/// Term with which a timeline is initially created. It is a normal
+/// term, except that the walproposer (wp) is never elected with it.
+pub const INITIAL_TERM: Term = 0;

 /// Information about Postgres. Safekeeper gets it once and then verifies all
 /// further connections from computes match. Note: it is a part of sk control
--- a/libs/safekeeper_api/src/membership.rs
+++ b/libs/safekeeper_api/src/membership.rs
@@ -0,0 +1,160 @@
+//! Types defining safekeeper membership, see
+//! rfcs/035-safekeeper-dynamic-membership-change.md
+//! for details.
+
+use std::{collections::HashSet, fmt::Display};
+
+use anyhow;
+use anyhow::bail;
+use serde::{Deserialize, Serialize};
+use utils::id::NodeId;
+
+/// Number uniquely identifying safekeeper configuration.
+/// Note: it is a part of sk control file.
+pub type Generation = u32;
+/// 1 is the first valid generation, 0 is used as
+/// a placeholder before we fully migrate to generations.
+pub const INVALID_GENERATION: Generation = 0;
+pub const INITIAL_GENERATION: Generation = 1;
+
+/// Membership is defined by ids, so e.g. walproposer uses them to figure out quorums,
+/// but we also carry host and port to give walproposer (wp) an idea of where to connect.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct SafekeeperId {
+    pub id: NodeId,
+    pub host: String,
+    /// Only the port for computes is included here -- that is, the tenant-only pg
+    /// protocol port, or the wide pg protocol port if the former is not configured.
+    pub pg_port: u16,
+}
+
+impl Display for SafekeeperId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
+    }
+}
+
+/// Set of safekeepers.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[serde(transparent)]
+pub struct MemberSet {
+    pub m: Vec<SafekeeperId>,
+}
+
+impl MemberSet {
+    pub fn empty() -> Self {
+        MemberSet { m: Vec::new() }
+    }
+
+    /// Creates a set from `members`; fails if two of them share a node id.
+    pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
+        let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
+        if hs.len() != members.len() {
+            bail!("duplicate safekeeper id in the set {:?}", members);
+        }
+        Ok(MemberSet { m: members })
+    }
+
+    /// Membership is decided by node id only; host and port are not compared.
+    pub fn contains(&self, sk: &SafekeeperId) -> bool {
+        self.m.iter().any(|m| m.id == sk.id)
+    }
+
+    /// Appends `sk`; fails if a member with the same node id is already present.
+    pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
+        if self.contains(&sk) {
+            bail!("sk {} is already member of the set {}", sk.id, self);
+        }
+        self.m.push(sk);
+        Ok(())
+    }
+}
+
+impl Display for MemberSet {
+    /// Display as a comma separated list of members.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
+        write!(f, "({})", sks_str.join(", "))
+    }
+}
+
+/// Safekeeper membership configuration.
+/// Note: it is a part of both control file and http API.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Configuration {
+    /// Unique id.
+    pub generation: Generation,
+    /// Current members of the configuration.
+    pub members: MemberSet,
+    /// Some means it is a joint conf.
+    pub new_members: Option<MemberSet>,
+}
+
+impl Configuration {
+    /// Used for pre-generations timelines, will be removed eventually.
+    pub fn empty() -> Self {
+        Configuration {
+            generation: INVALID_GENERATION,
+            members: MemberSet::empty(),
+            new_members: None,
+        }
+    }
+}
+
+impl Display for Configuration {
+    /// Formats as `gen=G, members=(..), new_members=(..)`.
+    ///
+    /// `new_members` is printed as `none` when this is not a joint
+    /// configuration. Writes directly to the formatter so that no
+    /// placeholder string is allocated.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "gen={}, members={}, ", self.generation, self.members)?;
+        match &self.new_members {
+            Some(new_members) => write!(f, "new_members={new_members}"),
+            None => f.write_str("new_members=none"),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{MemberSet, SafekeeperId};
+    use utils::id::NodeId;
+
+    #[test]
+    fn test_member_set() {
+        let mut members = MemberSet::empty();
+        members
+            .add(SafekeeperId {
+                id: NodeId(42),
+                host: String::from("lala.org"),
+                pg_port: 5432,
+            })
+            .unwrap();
+
+        members
+            .add(SafekeeperId {
+                id: NodeId(42),
+                host: String::from("lala.org"),
+                pg_port: 5432,
+            })
+            .expect_err("duplicate must not be allowed");
+
+        members
+            .add(SafekeeperId {
+                id: NodeId(43),
+                host: String::from("bubu.org"),
+                pg_port: 5432,
+            })
+            .unwrap();
+
+        println!("members: {}", members);
+
+        let j = serde_json::to_string(&members).expect("failed to serialize");
+        println!("members json: {}", j);
+        assert_eq!(
+            j,
+            r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
+        );
+    }
+}
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -11,7 +11,7 @@ use utils::{
    pageserver_feedback::PageserverFeedback,
 };

-use crate::{ServerInfo, Term};
+use crate::{membership::Configuration, ServerInfo, Term};

 #[derive(Debug, Serialize)]
 pub struct SafekeeperStatus {
@@ -22,13 +22,16 @@ pub struct SafekeeperStatus {
 pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub peer_ids: Option<Vec<NodeId>>,
+    pub mconf: Configuration,
    pub pg_version: u32,
    pub system_id: Option<u64>,
+    // By default WAL_SEGMENT_SIZE
    pub wal_seg_size: Option<u32>,
-    pub commit_lsn: Lsn,
-    // If not passed, it is assigned to the beginning of commit_lsn segment.
-    pub local_start_lsn: Option<Lsn>,
+    pub start_lsn: Lsn,
+    // Normal creation should omit this field (start_lsn initializes all LSNs).
+    // However, we allow specifying a custom value higher than start_lsn for
+    // the manual recovery case, see test_s3_wal_replay.
+    pub commit_lsn: Option<Lsn>,
 }

 /// Same as TermLsn, but serializes LSN using display serializer
@@ -172,6 +175,7 @@ pub enum WalReceiverStatus {
 pub struct TimelineStatus {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
+    pub mconf: Configuration,
    pub acceptor_state: AcceptorStateStatus,
    pub pg_info: ServerInfo,
    pub flush_lsn: Lsn,
@@ -186,6 +190,20 @@ pub struct TimelineStatus {
    pub walreceivers: Vec<WalReceiverState>,
 }

+/// Request to switch membership configuration.
+#[derive(Serialize, Deserialize)]
+#[serde(transparent)]
+pub struct TimelineMembershipSwitchRequest {
+    pub mconf: Configuration,
+}
+
+/// In response both previous and current configuration are sent.
+#[derive(Serialize, Deserialize)]
+pub struct TimelineMembershipSwitchResponse {
+    pub previous_conf: Configuration,
+    pub current_conf: Configuration,
+}
+
 fn lsn_invalid() -> Lsn {
    Lsn::INVALID
 }
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -38,7 +38,6 @@ pub mod http;

 use opentelemetry::trace::TracerProvider;
 use opentelemetry::KeyValue;
-use opentelemetry_sdk::Resource;
 use tracing::Subscriber;
 use tracing_subscriber::registry::LookupSpan;
 use tracing_subscriber::Layer;
@@ -121,7 +120,10 @@ where
    S: Subscriber + for<'span> LookupSpan<'span>,
 {
    // Sets up exporter from the OTEL_EXPORTER_* environment variables.
-    let exporter = opentelemetry_otlp::new_exporter().http();
+    let exporter = opentelemetry_otlp::SpanExporter::builder()
+        .with_http()
+        .build()
+        .expect("could not initialize opentelemetry exporter");

    // TODO: opentelemetry::global::set_error_handler() with custom handler that
    //       bypasses default tracing layers, but logs regular looking log
@@ -132,17 +134,13 @@ where
        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
    );

-    let tracer = opentelemetry_otlp::new_pipeline()
-        .tracing()
-        .with_exporter(exporter)
-        .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
-            Resource::new(vec![KeyValue::new(
-                opentelemetry_semantic_conventions::resource::SERVICE_NAME,
-                service_name,
-            )]),
-        ))
-        .install_batch(opentelemetry_sdk::runtime::Tokio)
-        .expect("could not initialize opentelemetry exporter")
+    let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
+        .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
+        .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
+            opentelemetry_semantic_conventions::resource::SERVICE_NAME,
+            service_name,
+        )]))
+        .build()
        .tracer("global");

    tracing_opentelemetry::layer().with_tracer(tracer)
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
+inferno.workspace = true
 itertools.workspace = true
 fail.workspace = true
 futures = { workspace = true }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -112,9 +112,9 @@ impl Serialize for Generation {
            // We should never be asked to serialize a None. Structures
            // that include an optional generation should convert None to an
            // Option<Generation>::None
-            Err(serde::ser::Error::custom(
-                "Tried to serialize invalid generation ({self})",
-            ))
+            Err(serde::ser::Error::custom(format!(
+                "Tried to serialize invalid generation ({self:?})"
+            )))
        }
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tokio::sync::{mpsc, Mutex};
+use tokio::sync::{mpsc, Mutex, Notify};
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
    };
    let seconds = match parse_query_param(&req, "seconds")? {
        None => 5,
-        Some(seconds @ 1..=30) => seconds,
-        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
+        Some(seconds @ 1..=60) => seconds,
+        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
    };
    let frequency_hz = match parse_query_param(&req, "frequency")? {
        None => 99,
        Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
        Some(frequency) => frequency,
    };
-
-    // Only allow one profiler at a time.
-    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
-    let _lock = PROFILE_LOCK
-        .try_lock()
-        .map_err(|_| ApiError::Conflict("profiler already running".into()))?;
+    let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();

    // Take the profile.
-    let report = tokio::task::spawn_blocking(move || {
+    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
+    static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
+
+    let report = {
+        // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
+        // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
+        // for a lock(), to avoid races where the notify isn't currently awaited.
+        let _lock = loop {
+            match PROFILE_LOCK.try_lock() {
+                Ok(lock) => break lock,
+                Err(_) if force => PROFILE_CANCEL.notify_waiters(),
+                Err(_) => {
+                    return Err(ApiError::Conflict(
+                        "profiler already running (use ?force=true to cancel it)".into(),
+                    ))
+                }
+            }
+            tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
+        };
+
        let guard = ProfilerGuardBuilder::default()
            .frequency(frequency_hz)
            .blocklist(&["libc", "libgcc", "pthread", "vdso"])
-            .build()?;
-        std::thread::sleep(Duration::from_secs(seconds));
-        guard.report().build()
-    })
-    .await
-    .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-    .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
+            .build()
+            .map_err(|err| ApiError::InternalServerError(err.into()))?;
+
+        tokio::select! {
+            _ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
+            _ = PROFILE_CANCEL.notified() => {},
+        };
+
+        guard
+            .report()
+            .build()
+            .map_err(|err| ApiError::InternalServerError(err.into()))?
+    };

    // Return the report in the requested format.
    match format {
@@ -417,6 +437,7 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
    enum Format {
        Jemalloc,
        Pprof,
+        Svg,
    }

    // Parameters.
@@ -424,9 +445,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
        None => Format::Pprof,
        Some("jemalloc") => Format::Jemalloc,
        Some("pprof") => Format::Pprof,
+        Some("svg") => Format::Svg,
        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
    };

+    // Functions and mappings to strip when symbolizing pprof profiles. If true,
+    // also remove child frames.
+    static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
+        vec![
+            (Regex::new("^__rust").unwrap(), false),
+            (Regex::new("^_start$").unwrap(), false),
+            (Regex::new("^irallocx_prof").unwrap(), true),
+            (Regex::new("^prof_alloc_prep").unwrap(), true),
+            (Regex::new("^std::rt::lang_start").unwrap(), false),
+            (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
+        ]
+    });
+    const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
+
    // Obtain profiler handle.
    let mut prof_ctl = jemalloc_pprof::PROF_CTL
        .as_ref()
@@ -464,24 +500,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
                // Symbolize the profile.
                // TODO: consider moving this upstream to jemalloc_pprof and avoiding the
                // serialization roundtrip.
-                static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
-                    // Functions to strip from profiles. If true, also remove child frames.
-                    vec![
-                        (Regex::new("^__rust").unwrap(), false),
-                        (Regex::new("^_start$").unwrap(), false),
-                        (Regex::new("^irallocx_prof").unwrap(), true),
-                        (Regex::new("^prof_alloc_prep").unwrap(), true),
-                        (Regex::new("^std::rt::lang_start").unwrap(), false),
-                        (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
-                    ]
-                });
                let profile = pprof::decode(&bytes)?;
                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(
-                    profile,
-                    &["libc", "libgcc", "pthread", "vdso"],
-                    &STRIP_FUNCTIONS,
-                );
+                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
                pprof::encode(&profile)
            })
            .await
@@ -494,6 +515,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
                .body(Body::from(data))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }
+
+        Format::Svg => {
+            let body = tokio::task::spawn_blocking(move || {
+                let bytes = prof_ctl.dump_pprof()?;
+                let profile = pprof::decode(&bytes)?;
+                let profile = pprof::symbolize(profile)?;
+                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
+                let mut opts = inferno::flamegraph::Options::default();
+                opts.title = "Heap inuse".to_string();
+                opts.count_name = "bytes".to_string();
+                pprof::flamegraph(profile, &mut opts)
+            })
+            .await
+            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+            .map_err(ApiError::InternalServerError)?;
+            Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, "image/svg+xml")
+                .body(Body::from(body))
+                .map_err(|err| ApiError::InternalServerError(err.into()))
+        }
    }
 }

--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -260,7 +260,7 @@ impl FromStr for Lsn {
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
            let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
-            Ok(Lsn((left_num as u64) << 32 | right_num as u64))
+            Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
        } else {
            Err(LsnParseError)
        }
--- a/libs/utils/src/pprof.rs
+++ b/libs/utils/src/pprof.rs
@@ -1,8 +1,9 @@
+use anyhow::bail;
 use flate2::write::{GzDecoder, GzEncoder};
 use flate2::Compression;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pprof::protos::{Function, Line, Message as _, Profile};
+use pprof::protos::{Function, Line, Location, Message as _, Profile};
 use regex::Regex;

 use std::borrow::Cow;
@@ -188,3 +189,59 @@ pub fn strip_locations(

    profile
 }
+
+/// Generates an SVG flamegraph from a symbolized pprof profile; errors if it isn't symbolized.
+pub fn flamegraph(
+    profile: Profile,
+    opts: &mut inferno::flamegraph::Options,
+) -> anyhow::Result<Vec<u8>> {
+    if profile.mapping.iter().any(|m| !m.has_functions) {
+        bail!("profile not symbolized");
+    }
+
+    // Index locations and functions by id; strings are looked up by position in the table.
+    let locations: HashMap<u64, Location> =
+        profile.location.into_iter().map(|l| (l.id, l)).collect();
+    let functions: HashMap<u64, Function> =
+        profile.function.into_iter().map(|f| (f.id, f)).collect();
+    let strings = profile.string_table;
+
+    // Resolve stacks as function names, and sum each sample's first value per stack. Also
+    // reverse the stack, since inferno expects it bottom-up.
+    let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
+    for sample in profile.sample {
+        let mut stack = Vec::with_capacity(sample.location_id.len());
+        for location in sample.location_id.into_iter().rev() {
+            let Some(location) = locations.get(&location) else {
+                bail!("missing location {location}");
+            };
+            for line in location.line.iter().rev() {
+                let Some(function) = functions.get(&line.function_id) else {
+                    bail!("missing function {}", line.function_id);
+                };
+                let Some(name) = strings.get(function.name as usize) else {
+                    bail!("missing string {}", function.name);
+                };
+                stack.push(name.as_str());
+            }
+        }
+        let Some(&value) = sample.value.first() else {
+            bail!("missing value");
+        };
+        *stacks.entry(stack).or_default() += value;
+    }
+
+    // Construct sorted `frame;frame;... value` lines for inferno.
+    let lines = stacks
+        .into_iter()
+        .map(|(stack, value)| (stack.into_iter().join(";"), value))
+        .map(|(stack, value)| format!("{stack} {value}"))
+        .sorted()
+        .collect_vec();
+
+    // Render the flamegraph SVG into an in-memory buffer.
+    let mut bytes = Vec::new();
+    let lines = lines.iter().map(|line| line.as_str());
+    inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
+    Ok(bytes)
+}
--- a/libs/utils/src/sync/spsc_fold.rs
+++ b/libs/utils/src/sync/spsc_fold.rs
@@ -96,7 +96,11 @@ impl<T: Send> Sender<T> {
                    }
                }
                State::SenderWaitsForReceiverToConsume(_data) => {
-                    // Really, we shouldn't be polled until receiver has consumed and wakes us.
+                    // SAFETY: send is single threaded due to `&mut self` requirement,
+                    // therefore register is not concurrent.
+                    unsafe {
+                        self.state.wake_sender.register(cx.waker());
+                    }
                    Poll::Pending
                }
                State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
@@ -449,4 +453,38 @@ mod tests {
        let err = recv_task.await.unwrap().expect_err("should error");
        assert!(matches!(err, RecvError::SenderGone));
    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
+        let (mut sender, receiver) = channel();
+
+        let state = receiver.state.clone();
+
+        sender.send((), |_, _| unreachable!()).await.unwrap();
+
+        assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
+
+        let unmergeable = sender.send((), |_, _| Err(()));
+        let mut unmergeable = std::pin::pin!(unmergeable);
+        tokio::select! {
+            _ = tokio::time::sleep(FOREVER) => {},
+            _ = &mut unmergeable => {
+                panic!("unmergeable should not complete");
+            },
+        }
+
+        assert!(matches!(
+            &*state.value.lock().unwrap(),
+            &State::SenderWaitsForReceiverToConsume(_)
+        ));
+
+        drop(receiver);
+
+        assert!(matches!(
+            &*state.value.lock().unwrap(),
+            &State::ReceiverGone
+        ));
+
+        unmergeable.await.unwrap_err();
+    }
 }
--- a/libs/wal_decoder/Cargo.toml
+++ b/libs/wal_decoder/Cargo.toml
@@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [build-dependencies]
 tonic-build.workspace = true
+
+[dev-dependencies]
+criterion.workspace = true
+camino.workspace = true
+camino-tempfile.workspace = true
+remote_storage.workspace = true
+tokio-util.workspace = true
+serde_json.workspace = true
+futures.workspace = true
+tikv-jemallocator.workspace = true
+pprof.workspace = true
+
+[[bench]]
+name = "bench_interpret_wal"
+harness = false
--- a/libs/wal_decoder/benches/README.md
+++ b/libs/wal_decoder/benches/README.md
@@ -0,0 +1,34 @@
+## WAL Decoding and Interpretation Benchmarks
+
+Note that these benchmarks pull WAL from a public S3 bucket
+as a preparation step. Hence, you need a way to authenticate with AWS.
+You can achieve this by copying the `~/.aws/config` file from
+the AWS SSO Notion page and exporting `AWS_PROFILE=dev` when invoking
+the benchmarks.
+
+To run benchmarks:
+
+```sh
+aws sso login --profile dev
+
+# All benchmarks.
+AWS_PROFILE=dev cargo bench --package wal_decoder
+
+# Specific file.
+AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal
+
+# Specific benchmark.
+AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded
+
+# List available benchmarks.
+cargo bench --package wal_decoder --benches -- --list
+
+# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
+# Output in target/criterion/*/profile/flamegraph.svg.
+AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10
+```
+
+Additional charts and statistics are available in `target/criterion/report/index.html`.
+
+Benchmarks are automatically compared against the previous run. To compare against other runs, see
+`--baseline` and `--save-baseline`.
--- a/libs/wal_decoder/benches/bench_interpret_wal.rs
+++ b/libs/wal_decoder/benches/bench_interpret_wal.rs
@@ -0,0 +1,250 @@
+use anyhow::Context;
+use criterion::{criterion_group, criterion_main, Criterion};
+use futures::{stream::FuturesUnordered, StreamExt};
+use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
+use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use pprof::criterion::{Output, PProfProfiler};
+use serde::Deserialize;
+use std::{env, num::NonZeroUsize, sync::Arc};
+
+use camino::{Utf8Path, Utf8PathBuf};
+use camino_tempfile::Utf8TempDir;
+use remote_storage::{
+    DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
+};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    lsn::Lsn,
+    shard::{ShardCount, ShardNumber},
+};
+use wal_decoder::models::InterpretedWalRecord;
+
+const S3_BUCKET: &str = "neon-github-public-dev";
+const S3_REGION: &str = "eu-central-1";
+const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
+const METADATA_FILENAME: &str = "metadata.json";
+
+/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
+/// This mirrors the configuration in bin/safekeeper.rs.
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+
+/// Builds a remote storage client for the public S3 bucket holding the benchmark WAL.
+/// The client is scoped to [`BUCKET_PREFIX`] via `prefix_in_bucket`.
+async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    let s3 = S3Config {
+        bucket_name: S3_BUCKET.to_owned(),
+        bucket_region: S3_REGION.to_owned(),
+        prefix_in_bucket: Some(BUCKET_PREFIX.to_owned()),
+        endpoint: None,
+        concurrency_limit: NonZeroUsize::new(100).unwrap(),
+        max_keys_per_list_response: None,
+        upload_storage_class: None,
+    };
+    let config = RemoteStorageConfig {
+        storage: RemoteStorageKind::AwsS3(s3),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
+    };
+    let storage = GenericRemoteStorage::from_config(&config).await;
+    Ok(Arc::new(storage.context("remote storage init")?))
+}
+
+/// Downloads all objects listed by `client` into a new temp dir under the current directory.
+async fn download_bench_data(
+    client: Arc<GenericRemoteStorage>,
+    cancel: &CancellationToken,
+) -> anyhow::Result<Utf8TempDir> {
+    let temp_dir_parent: Utf8PathBuf = env::current_dir()?.try_into()?;
+    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
+    eprintln!("Downloading benchmark data to {:?}", temp_dir);
+
+    let listing = client
+        .list(None, ListingMode::NoDelimiter, None, cancel)
+        .await?;
+
+    let mut downloads = listing
+        .keys
+        .into_iter()
+        .map(|obj| {
+            let client = client.clone();
+            let temp_dir_path = temp_dir.path().to_owned();
+
+            async move {
+                let remote_path = obj.key;
+                let download = client
+                    .download(&remote_path, &DownloadOpts::default(), cancel)
+                    .await?;
+                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+
+                let file_name = remote_path.object_name().context("no object name")?;
+                let file_path = temp_dir_path.join(file_name);
+                let file = tokio::fs::OpenOptions::new()
+                    .create(true)
+                    .truncate(true)
+                    .write(true)
+                    .open(&file_path)
+                    .await?;
+
+                let mut writer = tokio::io::BufWriter::new(file);
+                tokio::io::copy_buf(&mut body, &mut writer).await?;
+
+                Ok::<(), anyhow::Error>(())
+            }
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    while let Some(download) = downloads.next().await {
+        download?;
+    }
+
+    Ok(temp_dir)
+}
+
+struct BenchmarkData {
+    wal: Vec<u8>,
+    meta: BenchmarkMetadata,
+}
+
+#[derive(Deserialize)]
+struct BenchmarkMetadata {
+    pg_version: u32,
+    start_lsn: Lsn,
+}
+
+/// Reads `metadata.json` and up to `input_size` bytes of decompressed WAL segments from `path`.
+async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
+    eprintln!("Loading benchmark data from {:?}", path);
+    let mut entries = tokio::fs::read_dir(path).await?;
+    let mut ordered_segment_paths = Vec::new();
+    let mut metadata = None;
+
+    while let Some(entry) = entries.next_entry().await? {
+        if entry.file_name() == METADATA_FILENAME {
+            let bytes = tokio::fs::read(entry.path()).await?;
+            metadata = Some(
+                serde_json::from_slice::<BenchmarkMetadata>(&bytes)
+                    .context("failed to deserialize metadata.json")?,
+            );
+        } else {
+            ordered_segment_paths.push(entry.path());
+        }
+    }
+
+    ordered_segment_paths.sort();
+
+    let mut buffer = Vec::new();
+    for path in ordered_segment_paths {
+        if buffer.len() >= input_size {
+            break;
+        }
+
+        use async_compression::tokio::bufread::ZstdDecoder;
+        let file = tokio::fs::File::open(path).await?;
+        let reader = tokio::io::BufReader::new(file);
+        let decoder = ZstdDecoder::new(reader);
+        let mut reader = tokio::io::BufReader::new(decoder);
+        tokio::io::copy_buf(&mut reader, &mut buffer).await?;
+    }
+
+    buffer.truncate(input_size);
+
+    Ok(BenchmarkData {
+        wal: buffer,
+        meta: metadata.context("metadata.json not found in benchmark data")?,
+    })
+}
+
+/// Downloads and loads the WAL once, then benchmarks decode + interpret for several shard sets.
+fn criterion_benchmark(c: &mut Criterion) {
+    const INPUT_SIZE: usize = 128 * 1024 * 1024;
+
+    let setup_runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
+        let cancel = CancellationToken::new();
+        let client = create_s3_client().await.unwrap();
+        let temp_dir = download_bench_data(client, &cancel).await.unwrap();
+        let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
+        (temp_dir, bench_data)
+    });
+
+    eprintln!(
+        "Benchmarking against {} MiB of WAL",
+        bench_data.wal.len() / 1024 / 1024
+    );
+
+    let mut group = c.benchmark_group("decode-interpret-wal");
+    group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
+    group.sample_size(10);
+
+    group.bench_function("unsharded", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
+    });
+
+    let eight_shards = (0..8)
+        .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
+        .collect::<Vec<_>>();
+
+    group.bench_function("8/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
+    });
+
+    let four_shards = eight_shards
+        .into_iter()
+        .filter(|s| s.number.0 % 2 == 0)
+        .collect::<Vec<_>>();
+    group.bench_function("4/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &four_shards))
+    });
+
+    let two_shards = four_shards
+        .into_iter()
+        .filter(|s| s.number.0 % 4 == 0)
+        .collect::<Vec<_>>();
+    group.bench_function("2/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &two_shards))
+    });
+}
+
+/// Benchmark entry point: runs one decode + interpret pass and panics on failure.
+fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
+    if let Err(err) = decode_interpret(bench, shards) {
+        panic!("{err:?}");
+    }
+}
+
+/// Decodes the benchmark WAL and interprets every record for `shard`, propagating any error.
+fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
+    let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
+    let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
+        decoder.feed_bytes(chunk);
+        while let Some((lsn, recdata)) = decoder.poll_decode()? {
+            assert!(lsn.is_aligned());
+            let _ = InterpretedWalRecord::from_bytes_filtered(
+                recdata,
+                shard,
+                lsn,
+                bench.meta.pg_version,
+            )?;
+        }
+    }
+
+    Ok(())
+}
+criterion_group!(
+    name=benches;
+    config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets=criterion_benchmark
+);
+criterion_main!(benches);
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -1,6 +1,8 @@
 //! This module contains logic for decoding and interpreting
 //! raw bytes which represent a raw Postgres WAL record.

+use std::collections::HashMap;
+
 use crate::models::*;
 use crate::serialized_batch::SerializedValueBatch;
 use bytes::{Buf, Bytes};
@@ -14,15 +16,15 @@ use utils::lsn::Lsn;

 impl InterpretedWalRecord {
    /// Decode and interpreted raw bytes which represent one Postgres WAL record.
-    /// Data blocks which do not match the provided shard identity are filtered out.
+    /// Data blocks which do not match any of the provided shard identities are filtered out.
    /// Shard 0 is a special case since it tracks all relation sizes. We only give it
    /// the keys that are being written as that is enough for updating relation sizes.
    pub fn from_bytes_filtered(
        buf: Bytes,
-        shard: &ShardIdentity,
+        shards: &[ShardIdentity],
        next_record_lsn: Lsn,
        pg_version: u32,
-    ) -> anyhow::Result<InterpretedWalRecord> {
+    ) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
        let mut decoded = DecodedWALRecord::default();
        decode_wal_record(buf, &mut decoded, pg_version)?;
        let xid = decoded.xl_xid;
@@ -33,43 +35,57 @@ impl InterpretedWalRecord {
            FlushUncommittedRecords::No
        };

-        let metadata_record =
-            MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
-        let batch = SerializedValueBatch::from_decoded_filtered(
+        let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
+            HashMap::with_capacity(shards.len());
+        for shard in shards {
+            shard_records.insert(
+                *shard,
+                InterpretedWalRecord {
+                    metadata_record: None,
+                    batch: SerializedValueBatch::default(),
+                    next_record_lsn,
+                    flush_uncommitted,
+                    xid,
+                },
+            );
+        }
+
+        MetadataRecord::from_decoded_filtered(
+            &decoded,
+            &mut shard_records,
+            next_record_lsn,
+            pg_version,
+        )?;
+        SerializedValueBatch::from_decoded_filtered(
            decoded,
-            shard,
+            &mut shard_records,
            next_record_lsn,
            pg_version,
        )?;

-        Ok(InterpretedWalRecord {
-            metadata_record,
-            batch,
-            next_record_lsn,
-            flush_uncommitted,
-            xid,
-        })
+        Ok(shard_records)
    }
 }

 impl MetadataRecord {
-    /// Builds a metadata record for this WAL record, if any.
+    /// Populates the given `shard_records` with metadata records from this WAL record, if any,
+    /// discarding those belonging to other shards.
    ///
-    /// Only metadata records relevant for the given shard are emitted. Currently, most metadata
+    /// Only metadata records relevant for the given shards are emitted. Currently, most metadata
    /// records are broadcast to all shards for simplicity, but this should be improved.
    fn from_decoded_filtered(
        decoded: &DecodedWALRecord,
-        shard: &ShardIdentity,
+        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
        pg_version: u32,
-    ) -> anyhow::Result<Option<MetadataRecord>> {
+    ) -> anyhow::Result<()> {
        // Note: this doesn't actually copy the bytes since
        // the [`Bytes`] type implements it via a level of indirection.
        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);

        // First, generate metadata records from the decoded WAL record.
-        let mut metadata_record = match decoded.xl_rmid {
+        let metadata_record = match decoded.xl_rmid {
            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                Self::decode_heapam_record(&mut buf, decoded, pg_version)?
            }
@@ -112,41 +128,65 @@ impl MetadataRecord {
        };

        // Next, filter the metadata record by shard.
-        match metadata_record {
-            Some(
-                MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
-                | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
-            ) => {
-                // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
-                // of the main relation. These are sharded and managed just like regular relation pages.
-                // See: https://github.com/neondatabase/neon/issues/9855
-                let is_local_vm_page = |heap_blk| {
-                    let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
-                    shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
-                };
-                // Send the old and new VM page updates to their respective shards.
-                clear_vm_bits.old_heap_blkno = clear_vm_bits
-                    .old_heap_blkno
-                    .filter(|&blkno| is_local_vm_page(blkno));
-                clear_vm_bits.new_heap_blkno = clear_vm_bits
-                    .new_heap_blkno
-                    .filter(|&blkno| is_local_vm_page(blkno));
-                // If neither VM page belongs to this shard, discard the record.
-                if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
-                {
-                    metadata_record = None
+        for (shard, record) in shard_records.iter_mut() {
+            match metadata_record {
+                Some(
+                    MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits))
+                    | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)),
+                ) => {
+                    // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
+                    // of the main relation. These are sharded and managed just like regular relation pages.
+                    // See: https://github.com/neondatabase/neon/issues/9855
+                    let is_local_vm_page = |heap_blk| {
+                        let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
+                        shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
+                    };
+                    // Send the old and new VM page updates to their respective shards.
+                    let updated_old_heap_blkno = clear_vm_bits
+                        .old_heap_blkno
+                        .filter(|&blkno| is_local_vm_page(blkno));
+                    let updated_new_heap_blkno = clear_vm_bits
+                        .new_heap_blkno
+                        .filter(|&blkno| is_local_vm_page(blkno));
+                    // Only emit the record if at least one VM page belongs to this shard.
+                    if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() {
+                        // Clone the record and update it for the current shard.
+                        let mut for_shard = metadata_record.clone();
+                        match for_shard {
+                            Some(
+                                MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
+                                    ref mut clear_vm_bits,
+                                ))
+                                | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
+                                    ref mut clear_vm_bits,
+                                )),
+                            ) => {
+                                clear_vm_bits.old_heap_blkno = updated_old_heap_blkno;
+                                clear_vm_bits.new_heap_blkno = updated_new_heap_blkno;
+                                record.metadata_record = for_shard;
+                            }
+                            _ => {
+                                unreachable!("for_shard is a clone of what we checked above")
+                            }
+                        }
+                    }
+                }
+                Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
+                    // Filter LogicalMessage records (AUX files) to only be stored on shard zero
+                    if shard.is_shard_zero() {
+                        record.metadata_record = metadata_record;
+                        // No other shards should receive this record, so we stop traversing shards early.
+                        break;
+                    }
+                }
+                _ => {
+                    // All other metadata records are sent to all shards.
+                    record.metadata_record = metadata_record.clone();
                }
            }
-            Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
-                // Filter LogicalMessage records (AUX files) to only be stored on shard zero
-                if !shard.is_shard_zero() {
-                    metadata_record = None;
-                }
-            }
-            _ => {}
        }

-        Ok(metadata_record)
+        Ok(())
    }

    fn decode_heapam_record(
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -48,7 +48,7 @@ pub mod proto {
    tonic::include_proto!("interpreted_wal");
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Copy, Clone, Serialize, Deserialize)]
 pub enum FlushUncommittedRecords {
    Yes,
    No,
@@ -107,7 +107,7 @@ impl InterpretedWalRecord {

 /// The interpreted part of the Postgres WAL record which requires metadata
 /// writes to the underlying storage engine.
-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum MetadataRecord {
    Heapam(HeapamRecord),
    Neonrmgr(NeonrmgrRecord),
@@ -123,12 +123,12 @@ pub enum MetadataRecord {
    Replorigin(ReploriginRecord),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
@@ -136,29 +136,29 @@ pub struct ClearVmBits {
    pub flags: u8,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(XlSmgrTruncate),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct SmgrCreate {
    pub rel: RelTag,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
@@ -166,32 +166,32 @@ pub struct DbaseCreate {
    pub src_tablespace_id: Oid,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
@@ -200,7 +200,7 @@ pub enum XactRecord {
    Prepare(XactPrepare),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
@@ -209,73 +209,73 @@ pub struct XactCommon {
    pub lsn: Lsn,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum RelmapRecord {
    Update(RelmapUpdate),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum XlogRecord {
    Raw(RawXlogRecord),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -5,7 +5,7 @@
 //! Such batches are created from decoded PG wal records and ingested
 //! by the pageserver by writing directly to the ephemeral file.

-use std::collections::BTreeSet;
+use std::collections::{BTreeSet, HashMap};

 use bytes::{Bytes, BytesMut};
 use pageserver_api::key::rel_block_to_key;
@@ -22,6 +22,8 @@ use utils::lsn::Lsn;

 use pageserver_api::key::Key;

+use crate::models::InterpretedWalRecord;
+
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

 /// Accompanying metadata for the batch
@@ -128,7 +130,8 @@ impl Default for SerializedValueBatch {
 }

 impl SerializedValueBatch {
-    /// Build a batch of serialized values from a decoded PG WAL record
+    /// Populates the given `shard_records` with value batches from this WAL record, if any,
+    /// discarding those belonging to other shards.
    ///
    /// The batch will only contain values for keys targeting the specifiec
    /// shard. Shard 0 is a special case, where any keys that don't belong to
@@ -136,21 +139,20 @@ impl SerializedValueBatch {
    /// but absent from the raw buffer [`SerializedValueBatch::raw`]).
    pub(crate) fn from_decoded_filtered(
        decoded: DecodedWALRecord,
-        shard: &ShardIdentity,
+        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
        pg_version: u32,
-    ) -> anyhow::Result<SerializedValueBatch> {
-        // First determine how big the buffer needs to be and allocate it up-front.
+    ) -> anyhow::Result<()> {
+        // First determine how big the buffers need to be and allocate them up-front.
        // This duplicates some of the work below, but it's empirically much faster.
-        let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
-        let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
+        for (shard, record) in shard_records.iter_mut() {
+            assert!(record.batch.is_empty());
+
+            let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version);
+            record.batch.raw = Vec::with_capacity(estimate);
+        }

-        let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
-        let mut max_lsn: Lsn = Lsn(0);
-        let mut len: usize = 0;
        for blk in decoded.blocks.iter() {
-            let relative_off = buf.len() as u64;
-
            let rel = RelTag {
                spcnode: blk.rnode_spcnode,
                dbnode: blk.rnode_dbnode,
@@ -168,99 +170,98 @@ impl SerializedValueBatch {
                );
            }

-            let key_is_local = shard.is_key_local(&key);
+            for (shard, record) in shard_records.iter_mut() {
+                let key_is_local = shard.is_key_local(&key);

-            tracing::debug!(
-                lsn=%next_record_lsn,
-                key=%key,
-                "ingest: shard decision {}",
-                if !key_is_local { "drop" } else { "keep" },
-            );
+                tracing::debug!(
+                    lsn=%next_record_lsn,
+                    key=%key,
+                    "ingest: shard decision {}",
+                    if !key_is_local { "drop" } else { "keep" },
+                );

-            if !key_is_local {
-                if shard.is_shard_zero() {
-                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
-                    // its blkno in case it implicitly extends a relation.
-                    metadata.push(ValueMeta::Observed(ObservedValueMeta {
+                if !key_is_local {
+                    if shard.is_shard_zero() {
+                        // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
+                        // its blkno in case it implicitly extends a relation.
+                        record
+                            .batch
+                            .metadata
+                            .push(ValueMeta::Observed(ObservedValueMeta {
+                                key: key.to_compact(),
+                                lsn: next_record_lsn,
+                            }))
+                    }
+
+                    continue;
+                }
+
+                // Instead of storing full-page-image WAL record,
+                // it is better to store extracted image: we can skip wal-redo
+                // in this case. Also some FPI records may contain multiple (up to 32) pages,
+                // so they have to be copied multiple times.
+                //
+                let val = if Self::block_is_image(&decoded, blk, pg_version) {
+                    // Extract page image from FPI record
+                    let img_len = blk.bimg_len as usize;
+                    let img_offs = blk.bimg_offset as usize;
+                    let mut image = BytesMut::with_capacity(BLCKSZ as usize);
+                    // TODO(vlad): skip the copy
+                    image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
+
+                    if blk.hole_length != 0 {
+                        let tail = image.split_off(blk.hole_offset as usize);
+                        image.resize(image.len() + blk.hole_length as usize, 0u8);
+                        image.unsplit(tail);
+                    }
+                    //
+                    // Match the logic of XLogReadBufferForRedoExtended:
+                    // The page may be uninitialized. If so, we can't set the LSN because
+                    // that would corrupt the page.
+                    //
+                    if !page_is_new(&image) {
+                        page_set_lsn(&mut image, next_record_lsn)
+                    }
+                    assert_eq!(image.len(), BLCKSZ as usize);
+
+                    Value::Image(image.freeze())
+                } else {
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: blk.will_init || blk.apply_image,
+                        rec: decoded.record.clone(),
+                    })
+                };
+
+                let relative_off = record.batch.raw.len() as u64;
+
+                val.ser_into(&mut record.batch.raw)
+                    .expect("Writing into in-memory buffer is infallible");
+
+                let val_ser_size = record.batch.raw.len() - relative_off as usize;
+
+                record
+                    .batch
+                    .metadata
+                    .push(ValueMeta::Serialized(SerializedValueMeta {
                        key: key.to_compact(),
                        lsn: next_record_lsn,
-                    }))
-                }
-
-                continue;
+                        batch_offset: relative_off,
+                        len: val_ser_size,
+                        will_init: val.will_init(),
+                    }));
+                record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn);
+                record.batch.len += 1;
            }
-
-            // Instead of storing full-page-image WAL record,
-            // it is better to store extracted image: we can skip wal-redo
-            // in this case. Also some FPI records may contain multiple (up to 32) pages,
-            // so them have to be copied multiple times.
-            //
-            let val = if Self::block_is_image(&decoded, blk, pg_version) {
-                // Extract page image from FPI record
-                let img_len = blk.bimg_len as usize;
-                let img_offs = blk.bimg_offset as usize;
-                let mut image = BytesMut::with_capacity(BLCKSZ as usize);
-                // TODO(vlad): skip the copy
-                image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
-
-                if blk.hole_length != 0 {
-                    let tail = image.split_off(blk.hole_offset as usize);
-                    image.resize(image.len() + blk.hole_length as usize, 0u8);
-                    image.unsplit(tail);
-                }
-                //
-                // Match the logic of XLogReadBufferForRedoExtended:
-                // The page may be uninitialized. If so, we can't set the LSN because
-                // that would corrupt the page.
-                //
-                if !page_is_new(&image) {
-                    page_set_lsn(&mut image, next_record_lsn)
-                }
-                assert_eq!(image.len(), BLCKSZ as usize);
-
-                Value::Image(image.freeze())
-            } else {
-                Value::WalRecord(NeonWalRecord::Postgres {
-                    will_init: blk.will_init || blk.apply_image,
-                    rec: decoded.record.clone(),
-                })
-            };
-
-            val.ser_into(&mut buf)
-                .expect("Writing into in-memory buffer is infallible");
-
-            let val_ser_size = buf.len() - relative_off as usize;
-
-            metadata.push(ValueMeta::Serialized(SerializedValueMeta {
-                key: key.to_compact(),
-                lsn: next_record_lsn,
-                batch_offset: relative_off,
-                len: val_ser_size,
-                will_init: val.will_init(),
-            }));
-            max_lsn = std::cmp::max(max_lsn, next_record_lsn);
-            len += 1;
        }

        if cfg!(any(debug_assertions, test)) {
-            let batch = Self {
-                raw: buf,
-                metadata,
-                max_lsn,
-                len,
-            };
-
-            batch.validate_lsn_order();
-
-            return Ok(batch);
+            // Validate that the batches are correct
+            for record in shard_records.values() {
+                record.batch.validate_lsn_order();
+            }
        }

-        Ok(Self {
-            raw: buf,
-            metadata,
-            max_lsn,
-            len,
-        })
+        Ok(())
    }

    /// Look into the decoded PG WAL record and determine
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -215,6 +215,7 @@ impl Wrapper {
            syncSafekeepers: config.sync_safekeepers,
            systemId: 0,
            pgTimeline: 1,
+            proto_version: 2,
            callback_data,
        };
        let c_config = Box::into_raw(Box::new(c_config));
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -44,6 +44,7 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 postgres_initdb.workspace = true
+pprof.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -108,3 +109,7 @@ harness = false
 [[bench]]
 name = "bench_ingest"
 harness = false
+
+[[bench]]
+name = "upload_queue"
+harness = false
--- a/pageserver/benches/upload_queue.rs
+++ b/pageserver/benches/upload_queue.rs
@@ -0,0 +1,87 @@
+//! Upload queue benchmarks.
+
+use std::str::FromStr as _;
+use std::sync::atomic::AtomicU32;
+use std::sync::Arc;
+
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use pageserver::tenant::metadata::TimelineMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
+use pageserver::tenant::IndexPart;
+use pprof::criterion::{Output, PProfProfiler};
+use utils::generation::Generation;
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+// Register benchmarks with Criterion.
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_upload_queue_next_ready,
+);
+criterion_main!(benches);
+
+/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
+/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
+/// queue as a whole is thus quadratic.
+///
+/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
+/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
+fn bench_upload_queue_next_ready(c: &mut Criterion) {
+    let mut g = c.benchmark_group("upload_queue_next_ready");
+    for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
+        g.bench_function(format!("inprogress={inprogress}"), |b| {
+            run_bench(b, inprogress).unwrap()
+        });
+    }
+
+    fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
+        // Construct two layers. layer0 is in the indexes, layer1 will be deleted.
+        let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
+        let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
+
+        let metadata = LayerFileMetadata {
+            shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
+            generation: Generation::Valid(1),
+            file_size: 0,
+        };
+
+        // Construct the (initial and uploaded) index with layer0.
+        let mut index = IndexPart::empty(TimelineMetadata::example());
+        index.layer_metadata.insert(layer0, metadata.clone());
+
+        // Construct the queue.
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
+
+        // Populate inprogress_tasks with a bunch of layer1 deletions.
+        let delete = UploadOp::Delete(Delete {
+            layers: vec![(layer1, metadata)],
+        });
+
+        for task_id in 0..(inprogress as u64) {
+            queue.inprogress_tasks.insert(
+                task_id,
+                Arc::new(UploadTask {
+                    task_id,
+                    retries: AtomicU32::new(0),
+                    op: delete.clone(),
+                    coalesced_ops: Vec::new(),
+                }),
+            );
+        }
+
+        // Benchmark index upload scheduling.
+        let index_upload = UploadOp::UploadMetadata {
+            uploaded: Box::new(index),
+        };
+
+        b.iter(|| {
+            queue.queued_operations.push_front(index_upload.clone());
+            assert!(queue.next_ready().is_some());
+        });
+
+        Ok(())
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG);
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

-/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
+/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
+/// performance-sensitive code will avoid allocations as far as possible anyway.
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

 const PID_FILE_NAME: &str = "pageserver.pid";

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
-    CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
-    TimelineGcRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
+    TimelineInfo,
 };
 use utils::{
    auth::SwappableJwtAuth,
@@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler(
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
-        let res = tenant.get_scheduled_compaction_tasks(timeline_id);
-        let mut resp = Vec::new();
-        for item in res {
-            resp.push(CompactInfoResponse {
-                compact_key_range: item.compact_key_range,
-                compact_lsn_range: item.compact_lsn_range,
-                sub_compaction: item.sub_compaction,
-            });
-        }
+        let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
        json_response(StatusCode::OK, resp)
    }
    .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -278,6 +278,8 @@ async fn import_wal(

    let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;

+    let shard = vec![*tline.get_shard_identity()];
+
    while last_lsn <= endpoint {
        // FIXME: assume postgresql tli 1 for now
        let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
@@ -314,10 +316,12 @@ async fn import_wal(
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                let interpreted = InterpretedWalRecord::from_bytes_filtered(
                    recdata,
-                    tline.get_shard_identity(),
+                    &shard,
                    lsn,
                    tline.pg_version,
-                )?;
+                )?
+                .remove(tline.get_shard_identity())
+                .unwrap();

                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
@@ -411,6 +415,7 @@ pub async fn import_wal_from_tar(
    let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
    let mut last_lsn = start_lsn;
    let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
+    let shard = vec![*tline.get_shard_identity()];

    // Ingest wal until end_lsn
    info!("importing wal until {}", end_lsn);
@@ -459,10 +464,12 @@ pub async fn import_wal_from_tar(
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                let interpreted = InterpretedWalRecord::from_bytes_filtered(
                    recdata,
-                    tline.get_shard_identity(),
+                    &shard,
                    lsn,
                    tline.pg_version,
-                )?;
+                )?
+                .remove(tline.get_shard_identity())
+                .unwrap();

                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_layers_visited_per_read_global",
-        "Number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_layers_visited_per_vectored_read_global",
@@ -1233,117 +1224,189 @@ pub(crate) struct SmgrOpTimerInner {
    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,

+    throttling: Arc<tenant_throttling::Pagestream>,
+
    timings: SmgrOpTimerState,
 }

+/// The stages of request processing are represented by the enum variants.
+/// Used as part of [`SmgrOpTimerInner::timings`].
+///
+/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
+/// transition points.
+/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
+/// to the next state.
+///
+/// Each request goes through every stage, in all configurations.
+///
 #[derive(Debug)]
 enum SmgrOpTimerState {
    Received {
+        // In the future, we may want to track the full time the request spent
+        // inside pageserver process (time spent in kernel buffers can't be tracked).
+        // `received_at` would be used for that.
+        #[allow(dead_code)]
        received_at: Instant,
    },
-    ThrottleDoneExecutionStarting {
-        received_at: Instant,
+    Throttling {
        throttle_started_at: Instant,
-        started_execution_at: Instant,
    },
+    Batching {
+        throttle_done_at: Instant,
+    },
+    Executing {
+        execution_started_at: Instant,
+    },
+    Flushing,
+    // NB: when adding observation points, remember to update the Drop impl.
 }

+// NB: when adding observation points, remember to update the Drop impl.
+impl SmgrOpTimer {
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
+        let Some(inner) = self.0.as_mut() else {
+            return;
+        };
+        let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
+            return;
+        };
+        inner.throttling.count_accounted_start.inc();
+        inner.timings = SmgrOpTimerState::Throttling {
+            throttle_started_at: at,
+        };
+    }
+
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
+        let Some(inner) = self.0.as_mut() else {
+            return;
+        };
+        let SmgrOpTimerState::Throttling {
+            throttle_started_at,
+        } = &inner.timings
+        else {
+            return;
+        };
+        inner.throttling.count_accounted_finish.inc();
+        match throttle {
+            ThrottleResult::NotThrottled { end } => {
+                inner.timings = SmgrOpTimerState::Batching {
+                    throttle_done_at: end,
+                };
+            }
+            ThrottleResult::Throttled { end } => {
+                // update metrics
+                inner.throttling.count_throttled.inc();
+                inner
+                    .throttling
+                    .wait_time
+                    .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
+                // state transition
+                inner.timings = SmgrOpTimerState::Batching {
+                    throttle_done_at: end,
+                };
+            }
+        }
+    }
+
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_execution_start(&mut self, at: Instant) {
+        let Some(inner) = self.0.as_mut() else {
+            return;
+        };
+        let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
+            return;
+        };
+        // update metrics
+        let batch = at - *throttle_done_at;
+        inner.global_batch_wait_time.observe(batch.as_secs_f64());
+        inner
+            .per_timeline_batch_wait_time
+            .observe(batch.as_secs_f64());
+        // state transition
+        inner.timings = SmgrOpTimerState::Executing {
+            execution_started_at: at,
+        }
+    }
+
+    /// For all but the first caller, this is a no-op.
+    /// The first caller receives `Some`, subsequent ones `None`.
+    ///
+    /// See [`SmgrOpTimerState`] for more context.
+    pub(crate) fn observe_execution_end_flush_start(
+        &mut self,
+        at: Instant,
+    ) -> Option<SmgrOpFlushInProgress> {
+        // NB: unlike the other observe_* methods, this one take()s.
+        #[allow(clippy::question_mark)] // maintain similar code pattern.
+        let Some(mut inner) = self.0.take() else {
+            return None;
+        };
+        let SmgrOpTimerState::Executing {
+            execution_started_at,
+        } = &inner.timings
+        else {
+            return None;
+        };
+        // update metrics
+        let execution = at - *execution_started_at;
+        inner
+            .global_execution_latency_histo
+            .observe(execution.as_secs_f64());
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
+        }
+
+        // state transition
+        inner.timings = SmgrOpTimerState::Flushing;
+
+        // return the flush in progress object which
+        // will do the remaining metrics updates
+        let SmgrOpTimerInner {
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            ..
+        } = inner;
+        Some(SmgrOpFlushInProgress {
+            flush_started_at: at,
+            global_micros: global_flush_in_progress_micros,
+            per_timeline_micros: per_timeline_flush_in_progress_micros,
+        })
+    }
+}
+
+/// The last stage of request processing is serializing and flushing the request
+/// into the TCP connection. We want to make slow flushes observable
+/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
+/// to periodically bump the metric.
+///
+/// If in the future we decide that we're not interested in live updates, we can
+/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
+/// and remove this struct from the code base.
 pub(crate) struct SmgrOpFlushInProgress {
    flush_started_at: Instant,
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
 }

-impl SmgrOpTimer {
-    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
-        let inner = self.0.as_mut().expect("other public methods consume self");
-        match (&mut inner.timings, throttle) {
-            (SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
-                ThrottleResult::NotThrottled { start } => {
-                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
-                        received_at: *received_at,
-                        throttle_started_at: *start,
-                        started_execution_at: *start,
-                    };
-                }
-                ThrottleResult::Throttled { start, end } => {
-                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
-                        received_at: *start,
-                        throttle_started_at: *start,
-                        started_execution_at: *end,
-                    };
-                }
-            },
-            (x, _) => panic!("called in unexpected state: {x:?}"),
-        }
-    }
-
-    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
-        let (flush_start, inner) = self
-            .smgr_op_end()
-            .expect("this method consume self, and the only other caller is drop handler");
-        let SmgrOpTimerInner {
-            global_flush_in_progress_micros,
-            per_timeline_flush_in_progress_micros,
-            ..
-        } = inner;
-        SmgrOpFlushInProgress {
-            flush_started_at: flush_start,
-            global_micros: global_flush_in_progress_micros,
-            per_timeline_micros: per_timeline_flush_in_progress_micros,
-        }
-    }
-
-    /// Returns `None`` if this method has already been called, `Some` otherwise.
-    fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
-        let inner = self.0.take()?;
-
-        let now = Instant::now();
-
-        let batch;
-        let execution;
-        let throttle;
-        match inner.timings {
-            SmgrOpTimerState::Received { received_at } => {
-                batch = (now - received_at).as_secs_f64();
-                // TODO: use label for dropped requests.
-                // This is quite rare in practice, only during tenant/pageservers shutdown.
-                throttle = Duration::ZERO;
-                execution = Duration::ZERO.as_secs_f64();
-            }
-            SmgrOpTimerState::ThrottleDoneExecutionStarting {
-                received_at,
-                throttle_started_at,
-                started_execution_at,
-            } => {
-                batch = (throttle_started_at - received_at).as_secs_f64();
-                throttle = started_execution_at - throttle_started_at;
-                execution = (now - started_execution_at).as_secs_f64();
-            }
-        }
-
-        // update time spent in batching
-        inner.global_batch_wait_time.observe(batch);
-        inner.per_timeline_batch_wait_time.observe(batch);
-
-        // time spent in throttle metric is updated by throttle impl
-        let _ = throttle;
-
-        // update metrics for execution latency
-        inner.global_execution_latency_histo.observe(execution);
-        if let Some(per_timeline_execution_latency_histo) =
-            &inner.per_timeline_execution_latency_histo
-        {
-            per_timeline_execution_latency_histo.observe(execution);
-        }
-
-        Some((now, inner))
-    }
-}
-
 impl Drop for SmgrOpTimer {
    fn drop(&mut self) {
-        self.smgr_op_end();
+        // In case of early drop, update any of the remaining metrics with
+        // observations so that (started,finished) counter pairs balance out
+        // and all counters on the latency path have the same number of
+        // observations.
+        // It's technically lying and it would be better if each metric had
+        // a separate label or similar for cancelled requests.
+        // But we don't have that right now and counter pairs balancing
+        // out is useful when using the metrics in panels and whatnot.
+        let now = Instant::now();
+        self.observe_throttle_start(now);
+        self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
+        self.observe_execution_start(now);
+        self.observe_execution_end_flush_start(now);
    }
 }

@@ -1354,12 +1417,12 @@ impl SmgrOpFlushInProgress {
    {
        let mut fut = std::pin::pin!(fut);

-        let now = Instant::now();
        // Whenever observe_guard gets called, or dropped,
        // it adds the time elapsed since its last call to metrics.
        // Last call is tracked in `now`.
        let mut observe_guard = scopeguard::guard(
            || {
+                let now = Instant::now();
                let elapsed = now - self.flush_started_at;
                self.global_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
@@ -1402,7 +1465,6 @@ pub enum SmgrQueryType {
    GetSlruSegment,
 }

-#[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
    global_started: [IntCounter; SmgrQueryType::COUNT],
    global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1414,6 +1476,7 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_flush_in_progress_micros: IntCounter,
    global_batch_wait_time: Histogram,
    per_timeline_batch_wait_time: Histogram,
+    throttling: Arc<tenant_throttling::Pagestream>,
 }

 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1619,7 +1682,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
 });

 impl SmgrQueryTimePerTimeline {
-    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
+    pub(crate) fn new(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
+    ) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
@@ -1680,6 +1747,7 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_flush_in_progress_micros,
            global_batch_wait_time,
            per_timeline_batch_wait_time,
+            throttling: pagestream_throttle_metrics,
        }
    }
    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
@@ -1695,88 +1763,24 @@ impl SmgrQueryTimePerTimeline {
        SmgrOpTimer(Some(SmgrOpTimerInner {
            global_execution_latency_histo: self.global_latency[op as usize].clone(),
            per_timeline_execution_latency_histo: per_timeline_latency_histo,
-            timings: SmgrOpTimerState::Received { received_at },
            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
            per_timeline_flush_in_progress_micros: self
                .per_timeline_flush_in_progress_micros
                .clone(),
            global_batch_wait_time: self.global_batch_wait_time.clone(),
            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
+            throttling: self.throttling.clone(),
+            timings: SmgrOpTimerState::Received { received_at },
        }))
    }

+    /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
        self.global_batch_size.observe(batch_size as f64);
        self.per_timeline_batch_size.observe(batch_size as f64);
    }
 }

-#[cfg(test)]
-mod smgr_query_time_tests {
-    use std::time::Instant;
-
-    use pageserver_api::shard::TenantShardId;
-    use strum::IntoEnumIterator;
-    use utils::id::{TenantId, TimelineId};
-
-    // Regression test, we used hard-coded string constants before using an enum.
-    #[test]
-    fn op_label_name() {
-        use super::SmgrQueryType::*;
-        let expect: [(super::SmgrQueryType, &'static str); 5] = [
-            (GetRelExists, "get_rel_exists"),
-            (GetRelSize, "get_rel_size"),
-            (GetPageAtLsn, "get_page_at_lsn"),
-            (GetDbSize, "get_db_size"),
-            (GetSlruSegment, "get_slru_segment"),
-        ];
-        for (op, expect) in expect {
-            let actual: &'static str = op.into();
-            assert_eq!(actual, expect);
-        }
-    }
-
-    #[test]
-    fn basic() {
-        let ops: Vec<_> = super::SmgrQueryType::iter().collect();
-
-        for op in &ops {
-            let tenant_id = TenantId::generate();
-            let timeline_id = TimelineId::generate();
-            let metrics = super::SmgrQueryTimePerTimeline::new(
-                &TenantShardId::unsharded(tenant_id),
-                &timeline_id,
-            );
-
-            let get_counts = || {
-                let global: u64 = ops
-                    .iter()
-                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
-                    .sum();
-                (
-                    global,
-                    metrics.per_timeline_getpage_latency.get_sample_count(),
-                )
-            };
-
-            let (pre_global, pre_per_tenant_timeline) = get_counts();
-            assert_eq!(pre_per_tenant_timeline, 0);
-
-            let timer = metrics.start_smgr_op(*op, Instant::now());
-            drop(timer);
-
-            let (post_global, post_per_tenant_timeline) = get_counts();
-            if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
-                // getpage ops are tracked per-timeline, others aren't
-                assert_eq!(post_per_tenant_timeline, 1);
-            } else {
-                assert_eq!(post_per_tenant_timeline, 0);
-            }
-            assert!(post_global > pre_global);
-        }
-    }
-}
-
 // keep in sync with control plane Go code so that we can validate
 // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
 static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -3572,9 +3576,7 @@ pub(crate) mod tenant_throttling {
    use once_cell::sync::Lazy;
    use utils::shard::TenantShardId;

-    use crate::tenant::{self};
-
-    struct GlobalAndPerTenantIntCounter {
+    pub(crate) struct GlobalAndPerTenantIntCounter {
        global: IntCounter,
        per_tenant: IntCounter,
    }
@@ -3592,10 +3594,10 @@ pub(crate) mod tenant_throttling {
    }

    pub(crate) struct Metrics<const KIND: usize> {
-        count_accounted_start: GlobalAndPerTenantIntCounter,
-        count_accounted_finish: GlobalAndPerTenantIntCounter,
-        wait_time: GlobalAndPerTenantIntCounter,
-        count_throttled: GlobalAndPerTenantIntCounter,
+        pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
+        pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
+        pub(super) wait_time: GlobalAndPerTenantIntCounter,
+        pub(super) count_throttled: GlobalAndPerTenantIntCounter,
    }

    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
@@ -3730,26 +3732,6 @@ pub(crate) mod tenant_throttling {
            }
        }
    }
-
-    impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
-        #[inline(always)]
-        fn accounting_start(&self) {
-            self.count_accounted_start.inc();
-        }
-        #[inline(always)]
-        fn accounting_finish(&self) {
-            self.count_accounted_finish.inc();
-        }
-        #[inline(always)]
-        fn observe_throttling(
-            &self,
-            tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
-        ) {
-            let val = u64::try_from(wait_time.as_micros()).unwrap();
-            self.wait_time.inc_by(val);
-            self.count_throttled.inc();
-        }
-    }
 }

 pub(crate) mod disk_usage_based_eviction {
@@ -3894,7 +3876,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
-        &READ_NUM_LAYERS_VISITED,
        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -592,40 +592,21 @@ enum BatchedFeMessage {
 }

 impl BatchedFeMessage {
-    async fn throttle_and_record_start_processing(
-        &mut self,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
-        let (shard, tokens, timers) = match self {
-            BatchedFeMessage::Exists { shard, timer, .. }
-            | BatchedFeMessage::Nblocks { shard, timer, .. }
-            | BatchedFeMessage::DbSize { shard, timer, .. }
-            | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
-                (
-                    shard,
-                    // 1 token is probably under-estimating because these
-                    // request handlers typically do several Timeline::get calls.
-                    1,
-                    itertools::Either::Left(std::iter::once(timer)),
-                )
+    fn observe_execution_start(&mut self, at: Instant) {
+        match self {
+            BatchedFeMessage::Exists { timer, .. }
+            | BatchedFeMessage::Nblocks { timer, .. }
+            | BatchedFeMessage::DbSize { timer, .. }
+            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
+                timer.observe_execution_start(at);
            }
-            BatchedFeMessage::GetPage { shard, pages, .. } => (
-                shard,
-                pages.len(),
-                itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)),
-            ),
-            BatchedFeMessage::RespondError { .. } => return Ok(()),
-        };
-        let throttled = tokio::select! {
-            throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
-            _ = cancel.cancelled() => {
-                return Err(QueryError::Shutdown);
+            BatchedFeMessage::GetPage { pages, .. } => {
+                for page in pages {
+                    page.timer.observe_execution_start(at);
+                }
            }
-        };
-        for timer in timers {
-            timer.observe_throttle_done_execution_starting(&throttled);
+            BatchedFeMessage::RespondError { .. } => {}
        }
-        Ok(())
    }
 }

@@ -717,6 +698,26 @@ impl PageServerHandler {
        let neon_fe_msg =
            PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

+        // TODO: turn into an async closure once available to avoid repeating received_at
+        async fn record_op_start_and_throttle(
+            shard: &timeline::handle::Handle<TenantManagerTypes>,
+            op: metrics::SmgrQueryType,
+            received_at: Instant,
+        ) -> Result<SmgrOpTimer, QueryError> {
+            // It's important to start the smgr op metric recorder as early as possible
+            // so that the _started counters are incremented before we do
+            // any serious waiting, e.g., for throttle, batching, or actual request handling.
+            let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
+            let now = Instant::now();
+            timer.observe_throttle_start(now);
+            let throttled = tokio::select! {
+                res = shard.pagestream_throttle.throttle(1, now) => res,
+                _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
+            };
+            timer.observe_throttle_done(throttled);
+            Ok(timer)
+        }
+
        let batched_msg = match neon_fe_msg {
            PagestreamFeMessage::Exists(req) => {
                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
@@ -724,9 +725,12 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = shard
-                    .query_metrics
-                    .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
+                let timer = record_op_start_and_throttle(
+                    &shard,
+                    metrics::SmgrQueryType::GetRelExists,
+                    received_at,
+                )
+                .await?;
                BatchedFeMessage::Exists {
                    span,
                    timer,
@@ -740,9 +744,12 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = shard
-                    .query_metrics
-                    .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
+                let timer = record_op_start_and_throttle(
+                    &shard,
+                    metrics::SmgrQueryType::GetRelSize,
+                    received_at,
+                )
+                .await?;
                BatchedFeMessage::Nblocks {
                    span,
                    timer,
@@ -756,9 +763,12 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = shard
-                    .query_metrics
-                    .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
+                let timer = record_op_start_and_throttle(
+                    &shard,
+                    metrics::SmgrQueryType::GetDbSize,
+                    received_at,
+                )
+                .await?;
                BatchedFeMessage::DbSize {
                    span,
                    timer,
@@ -772,9 +782,12 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = shard
-                    .query_metrics
-                    .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
+                let timer = record_op_start_and_throttle(
+                    &shard,
+                    metrics::SmgrQueryType::GetSlruSegment,
+                    received_at,
+                )
+                .await?;
                BatchedFeMessage::GetSlruSegment {
                    span,
                    timer,
@@ -823,12 +836,12 @@ impl PageServerHandler {
                    }
                };

-                // It's important to start the timer before waiting for the LSN
-                // so that the _started counters are incremented before we do
-                // any serious waiting, e.g., for LSNs.
-                let timer = shard
-                    .query_metrics
-                    .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
+                let timer = record_op_start_and_throttle(
+                    &shard,
+                    metrics::SmgrQueryType::GetPageAtLsn,
+                    received_at,
+                )
+                .await?;

                let effective_request_lsn = match Self::wait_or_get_last_lsn(
                    &shard,
@@ -934,6 +947,13 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        let started_at = Instant::now();
+        let batch = {
+            let mut batch = batch;
+            batch.observe_execution_start(started_at);
+            batch
+        };
+
        // invoke handler function
        let (handler_results, span): (
            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
@@ -1100,8 +1120,11 @@ impl PageServerHandler {
            // The timer's underlying metric is used for a storage-internal latency SLO and
            // we don't want to include latency in it that we can't control.
            // And as pointed out above, in this case, we don't control the time that flush will take.
-            let flushing_timer =
-                timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
+            let flushing_timer = timer.map(|mut timer| {
+                timer
+                    .observe_execution_end_flush_start(Instant::now())
+                    .expect("we are the first caller")
+            });

            // what we want to do
            let flush_fut = pgb_writer.flush();
@@ -1255,7 +1278,7 @@ impl PageServerHandler {
                Ok(msg) => msg,
                Err(e) => break e,
            };
-            let mut msg = match msg {
+            let msg = match msg {
                Some(msg) => msg,
                None => {
                    debug!("pagestream subprotocol end observed");
@@ -1263,10 +1286,6 @@ impl PageServerHandler {
                }
            };

-            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
-                break cancelled;
-            }
-
            let err = self
                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
                .await;
@@ -1426,15 +1445,12 @@ impl PageServerHandler {
                            return Ok(());
                        }
                    };
-                    let mut batch = match batch {
+                    let batch = match batch {
                        Ok(batch) => batch,
                        Err(e) => {
                            return Err(e);
                        }
                    };
-                    batch
-                        .throttle_and_record_start_processing(&self.cancel)
-                        .await?;
                    self.pagesteam_handle_batched_message(
                        pgb_writer,
                        batch,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
@@ -37,20 +38,17 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
-use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
-use timeline::compaction::GcCompactJob;
-use timeline::compaction::ScheduledCompactionTask;
+use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
-use timeline::CompactFlags;
+use timeline::offload::OffloadError;
 use timeline::CompactOptions;
-use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -346,10 +344,8 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

-    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
-    /// a manual gc-compaction from the manual compaction API.
-    scheduled_compaction_tasks:
-        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+    /// Scheduled gc-compaction tasks.
+    scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -369,8 +365,9 @@ pub struct Tenant {

    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
-    pub(crate) pagestream_throttle:
-        Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
+    pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
+
+    pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,

    /// An ongoing timeline detach concurrency limiter.
    ///
@@ -1691,6 +1688,7 @@ impl Tenant {
                    TimelineResources {
                        remote_client,
                        pagestream_throttle: self.pagestream_throttle.clone(),
+                        pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
                        l0_flush_global_state: self.l0_flush_global_state.clone(),
                    },
                    LoadTimelineCause::Attach,
@@ -2039,7 +2037,7 @@ impl Tenant {
    ) -> Result<Arc<Timeline>, TimelineArchivalError> {
        info!("unoffloading timeline");

-        // We activate the timeline below manually, so this must be called on an active timeline.
+        // We activate the timeline below manually, so this must be called on an active tenant.
        // We expect callers of this function to ensure this.
        match self.current_state() {
            TenantState::Activating { .. }
@@ -2996,113 +2994,35 @@ impl Tenant {
                if has_pending_l0_compaction_task {
                    Some(true)
                } else {
-                    let mut has_pending_scheduled_compaction_task;
-                    let next_scheduled_compaction_task = {
-                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
-                            if !tline_pending_tasks.is_empty() {
-                                info!(
-                                    "{} tasks left in the compaction schedule queue",
-                                    tline_pending_tasks.len()
-                                );
-                            }
-                            let next_task = tline_pending_tasks.pop_front();
-                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
-                            next_task
-                        } else {
-                            has_pending_scheduled_compaction_task = false;
-                            None
-                        }
+                    let queue = {
+                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        guard.get(timeline_id).cloned()
                    };
-                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
-                    {
-                        if !next_scheduled_compaction_task
-                            .options
-                            .flags
-                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
-                        {
-                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
-                        } else if next_scheduled_compaction_task.options.sub_compaction {
-                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-                            let jobs: Vec<GcCompactJob> = timeline
-                                .gc_compaction_split_jobs(
-                                    GcCompactJob::from_compact_options(
-                                        next_scheduled_compaction_task.options.clone(),
-                                    ),
-                                    next_scheduled_compaction_task
-                                        .options
-                                        .sub_compaction_max_job_size_mb,
-                                )
-                                .await
-                                .map_err(CompactionError::Other)?;
-                            if jobs.is_empty() {
-                                info!("no jobs to run, skipping scheduled compaction task");
-                            } else {
-                                has_pending_scheduled_compaction_task = true;
-                                let jobs_len = jobs.len();
-                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
-                                for (idx, job) in jobs.into_iter().enumerate() {
-                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
-                                    // until we do further refactors to allow directly call `compact_with_gc`.
-                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
-                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
-                                    if job.dry_run {
-                                        flags |= CompactFlags::DryRun;
-                                    }
-                                    let options = CompactOptions {
-                                        flags,
-                                        sub_compaction: false,
-                                        compact_key_range: Some(job.compact_key_range.into()),
-                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
-                                        sub_compaction_max_job_size_mb: None,
-                                    };
-                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            // The last job in the queue sends the signal and releases the gc guard
-                                            result_tx: next_scheduled_compaction_task
-                                                .result_tx
-                                                .take(),
-                                            gc_block: next_scheduled_compaction_task
-                                                .gc_block
-                                                .take(),
-                                        }
-                                    } else {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            result_tx: None,
-                                            gc_block: None,
-                                        }
-                                    });
-                                }
-                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
-                            }
-                        } else {
-                            let _ = timeline
-                                .compact_with_options(
-                                    cancel,
-                                    next_scheduled_compaction_task.options,
-                                    ctx,
-                                )
-                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
-                                .await?;
-                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
-                                // TODO: we can send compaction statistics in the future
-                                tx.send(()).ok();
-                            }
-                        }
+                    if let Some(queue) = queue {
+                        let has_pending_tasks = queue
+                            .iteration(cancel, ctx, &self.gc_block, timeline)
+                            .await?;
+                        Some(has_pending_tasks)
+                    } else {
+                        Some(false)
                    }
-                    Some(has_pending_scheduled_compaction_task)
                }
            } else {
                None
            };
            has_pending_task |= pending_task_left.unwrap_or(false);
            if pending_task_left == Some(false) && *can_offload {
-                offload_timeline(self, timeline)
+                pausable_failpoint!("before-timeline-auto-offload");
+                match offload_timeline(self, timeline)
                    .instrument(info_span!("offload_timeline", %timeline_id))
-                    .await?;
+                    .await
+                {
+                    Err(OffloadError::NotArchived) => {
+                        // Ignore this, we likely raced with unarchival
+                        Ok(())
+                    }
+                    other => other,
+                }?;
            }
        }

@@ -3115,34 +3035,32 @@ impl Tenant {
    }

    /// Cancel scheduled compaction tasks
-    pub(crate) fn cancel_scheduled_compaction(
-        &self,
-        timeline_id: TimelineId,
-    ) -> Vec<ScheduledCompactionTask> {
+    pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
-            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
-            current_tline_pending_tasks.into_iter().collect()
-        } else {
-            Vec::new()
+        if let Some(q) = guard.get_mut(&timeline_id) {
+            q.cancel_scheduled();
        }
    }

    pub(crate) fn get_scheduled_compaction_tasks(
        &self,
        timeline_id: TimelineId,
-    ) -> Vec<CompactOptions> {
-        use itertools::Itertools;
-        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-        guard
-            .get(&timeline_id)
-            .map(|tline_pending_tasks| {
-                tline_pending_tasks
-                    .iter()
-                    .map(|x| x.options.clone())
-                    .collect_vec()
-            })
-            .unwrap_or_default()
+    ) -> Vec<CompactInfoResponse> {
+        let res = {
+            let guard = self.scheduled_compaction_tasks.lock().unwrap();
+            guard.get(&timeline_id).map(|q| q.remaining_jobs())
+        };
+        let Some((running, remaining)) = res else {
+            return Vec::new();
+        };
+        let mut result = Vec::new();
+        if let Some((id, running)) = running {
+            result.extend(running.into_compact_info_resp(id, true));
+        }
+        for (id, job) in remaining {
+            result.extend(job.into_compact_info_resp(id, false));
+        }
+        result
    }

    /// Schedule a compaction task for a timeline.
@@ -3151,20 +3069,12 @@ impl Tenant {
        timeline_id: TimelineId,
        options: CompactOptions,
    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
-        let gc_guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(e) => {
-                bail!("cannot run gc-compaction because gc is blocked: {}", e);
-            }
-        };
        let (tx, rx) = tokio::sync::oneshot::channel();
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        let tline_pending_tasks = guard.entry(timeline_id).or_default();
-        tline_pending_tasks.push_back(ScheduledCompactionTask {
-            options,
-            result_tx: Some(tx),
-            gc_block: Some(gc_guard),
-        });
+        let q = guard
+            .entry(timeline_id)
+            .or_insert_with(|| Arc::new(GcCompactionQueue::new()));
+        q.schedule_manual_compaction(options, Some(tx));
        Ok(rx)
    }

@@ -4084,6 +3994,9 @@ impl Tenant {
        Ok(timeline)
    }

+    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
+    /// to ensure proper cleanup of background tasks and metrics.
+    //
    // Allow too_many_arguments because a constructor's argument list naturally grows with the
    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
    #[allow(clippy::too_many_arguments)]
@@ -4192,8 +4105,10 @@ impl Tenant {
            gate: Gate::default(),
            pagestream_throttle: Arc::new(throttle::Throttle::new(
                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
-                crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
            )),
+            pagestream_throttle_metrics: Arc::new(
+                crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
+            ),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
@@ -5100,6 +5015,7 @@ impl Tenant {
        TimelineResources {
            remote_client: self.build_timeline_remote_client(timeline_id),
            pagestream_throttle: self.pagestream_throttle.clone(),
+            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
        }
    }
@@ -5774,7 +5690,7 @@ mod tests {
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
    use itertools::Itertools;
-    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use pageserver_api::value::Value;
@@ -7833,7 +7749,18 @@ mod tests {
        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
        let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
        let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
+
+        let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
+        let base_inherited_key_child =
+            Key::from_hex("610000000033333333444444445500000001").unwrap();
+        let base_inherited_key_nonexist =
+            Key::from_hex("610000000033333333444444445500000002").unwrap();
+        let base_inherited_key_overwrite =
+            Key::from_hex("610000000033333333444444445500000003").unwrap();
+
        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
+        assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);

        let tline = tenant
            .create_test_timeline_with_layers(
@@ -7842,7 +7769,18 @@ mod tests {
                DEFAULT_PG_VERSION,
                &ctx,
                Vec::new(), // delta layers
-                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
+                vec![(
+                    Lsn(0x20),
+                    vec![
+                        (base_inherited_key, test_img("metadata inherited key 1")),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 1a"),
+                        ),
+                        (base_key, test_img("metadata key 1")),
+                        (base_key_overwrite, test_img("metadata key overwrite 1b")),
+                    ],
+                )], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
            )
            .await?;
@@ -7856,7 +7794,18 @@ mod tests {
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x30),
-                    vec![(base_key_child, test_img("metadata key 2"))],
+                    vec![
+                        (
+                            base_inherited_key_child,
+                            test_img("metadata inherited key 2"),
+                        ),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 2a"),
+                        ),
+                        (base_key_child, test_img("metadata key 2")),
+                        (base_key_overwrite, test_img("metadata key overwrite 2b")),
+                    ],
                )], // image layers
                Lsn(0x30),
            )
@@ -7878,6 +7827,26 @@ mod tests {
            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
            None
        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1a"))
+        );

        // test vectored get on child timeline
        assert_eq!(
@@ -7892,6 +7861,82 @@ mod tests {
            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
            None
        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2a"))
+        );
+
+        // test vectored scan on parent timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = tline
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 1a")
+                ),
+                (base_key, test_img("metadata key 1")),
+                (base_key_overwrite, test_img("metadata key overwrite 1b")),
+            ]
+        );
+
+        // test vectored scan on child timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = child
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_child,
+                    test_img("metadata inherited key 2")
+                ),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 2a")
+                ),
+                (base_key_child, test_img("metadata key 2")),
+                (base_key_overwrite, test_img("metadata key overwrite 2b")),
+            ]
+        );

        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
                .map(humantime),
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
-            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            timeline_get_throttle: value.timeline_get_throttle,
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -84,17 +84,17 @@ impl Value {

    fn to_u64(self) -> u64 {
        let b = &self.0;
-        (b[0] as u64) << 32
-            | (b[1] as u64) << 24
-            | (b[2] as u64) << 16
-            | (b[3] as u64) << 8
+        ((b[0] as u64) << 32)
+            | ((b[1] as u64) << 24)
+            | ((b[2] as u64) << 16)
+            | ((b[3] as u64) << 8)
            | b[4] as u64
    }

    fn to_blknum(self) -> u32 {
        let b = &self.0;
        assert!(b[0] == 0x80);
-        (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
+        ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
    }
 }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -320,7 +320,6 @@ impl TimelineMetadata {

    // Checksums make it awkward to build a valid instance by hand.  This helper
    // provides a TimelineMetadata with a valid checksum in its header.
-    #[cfg(test)]
    pub fn example() -> Self {
        let instance = Self::new(
            "0/16960E8".parse::<Lsn>().unwrap(),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -63,22 +63,18 @@
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
 //! described above.
+//!
 //! From the user's perspective, the operations are executed sequentially.
 //! Internally, the client knows which operations can be performed in parallel,
 //! and which operations act like a "barrier" that require preceding operations
 //! to finish. The calling code just needs to call the schedule-functions in the
 //! correct order, and the client will parallelize the operations in a way that
-//! is safe.
-//!
-//! The caller should be careful with deletion, though. They should not delete
-//! local files that have been scheduled for upload but not yet finished uploading.
-//! Otherwise the upload will fail. To wait for an upload to finish, use
-//! the 'wait_completion' function (more on that later.)
+//! is safe. For more details, see `UploadOp::can_bypass`.
 //!
 //! All of this relies on the following invariants:
 //!
 //! - We rely on read-after write consistency in the remote storage.
-//! - Layer files are immutable
+//! - Layer files are immutable.
 //!
 //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
 //! storage. Different tenants can be attached to different pageservers, but if the
@@ -304,6 +300,15 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
+
+#[derive(Debug, thiserror::Error)]
+pub enum ShutdownIfArchivedError {
+    #[error(transparent)]
+    NotInitialized(NotInitialized),
+    #[error("timeline is not archived")]
+    NotArchived,
+}
+
 /// Behavioral modes that enable seamless live migration.
 ///
 /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
@@ -420,8 +425,16 @@ impl RemoteTimelineClient {
    /// an index file upload, i.e., it's not empty.
    /// The given `index_part` must be the one on the remote.
    pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        info!(
            "initialized upload queue from remote index with {} layer files",
@@ -436,8 +449,16 @@ impl RemoteTimelineClient {
        &self,
        local_metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata)?;
+        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
        self.update_remote_physical_size_gauge(None);
        info!("initialized upload queue as empty");
        Ok(())
@@ -453,9 +474,15 @@ impl RemoteTimelineClient {
        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
        ))?;
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);

        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        self.stop_impl(&mut upload_queue);

@@ -816,6 +843,55 @@ impl RemoteTimelineClient {
        Ok(need_wait)
    }

+    /// Shuts the timeline client down, but only if the timeline is archived.
+    ///
+    /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
+    /// same lock to prevent races between unarchival and offloading: unarchival requires the
+    /// upload queue to be initialized, and leaves behind an upload queue where either dirty
+    /// or clean has archived_at of `None`. Offloading leaves behind an uninitialized upload
+    /// queue. Fails with `NotInitialized` or `NotArchived` without shutting anything down.
+    pub(crate) async fn shutdown_if_archived(
+        self: &Arc<Self>,
+    ) -> Result<(), ShutdownIfArchivedError> {
+        {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard
+                .initialized_mut()
+                .map_err(ShutdownIfArchivedError::NotInitialized)?;
+
+            match (
+                upload_queue.dirty.archived_at.is_none(),
+                upload_queue.clean.0.archived_at.is_none(),
+            ) {
+                // The expected case: the timeline is archived and we don't want to unarchive
+                (false, false) => {}
+                (true, false) => {
+                    tracing::info!("can't shut down timeline: timeline slated for unarchival");
+                    return Err(ShutdownIfArchivedError::NotArchived);
+                }
+                (dirty_unarchived, true) => {
+                    tracing::info!(dirty_archived = !dirty_unarchived, "can't shut down timeline: timeline not archived in remote storage");
+                    return Err(ShutdownIfArchivedError::NotArchived);
+                }
+            }
+
+            // Set the shutting_down flag while the guard from the archival check is held.
+            // This prevents a race with unarchival, as initialized_mut will not return
+            // an upload queue from this point.
+            // Also launch the queued tasks like shutdown() does.
+            if !upload_queue.shutting_down {
+                upload_queue.shutting_down = true;
+                upload_queue.queued_operations.push_back(UploadOp::Shutdown);
+                // This operation is not counted, similar to Barrier.
+                self.launch_queued_tasks(upload_queue);
+            }
+        }
+
+        self.shutdown().await;
+
+        Ok(())
+    }
+
    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
    pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
        self: &Arc<Self>,
@@ -1797,57 +1873,17 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
    ///
-    /// The caller needs to already hold the `upload_queue` lock.
+    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
+    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
+    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
    fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
-        while let Some(next_op) = upload_queue.queued_operations.front() {
-            // Can we run this task now?
-            let can_run_now = match next_op {
-                UploadOp::UploadLayer(..) => {
-                    // Can always be scheduled.
-                    true
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    // These can only be performed after all the preceding operations
-                    // have finished.
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-                UploadOp::Delete(..) => {
-                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
-                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
-                }
+        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
+            debug!("starting op: {next_op}");

-                UploadOp::Barrier(_) | UploadOp::Shutdown => {
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-            };
-
-            // If we cannot launch this task, don't look any further.
-            //
-            // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
-            // them now, but we don't try to do that currently.  For example, if the frontmost task
-            // is an index-file upload that cannot proceed until preceding uploads have finished, we
-            // could still start layer uploads that were scheduled later.
-            if !can_run_now {
-                break;
-            }
-
-            if let UploadOp::Shutdown = next_op {
-                // leave the op in the queue but do not start more tasks; it will be dropped when
-                // the stop is called.
-                upload_queue.shutdown_ready.close();
-                break;
-            }
-
-            // We can launch this task. Remove it from the queue first.
-            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
-
-            debug!("starting op: {}", next_op);
-
-            // Update the counters and prepare
+            // Prepare upload.
            match &mut next_op {
                UploadOp::UploadLayer(layer, meta, mode) => {
                    if upload_queue
@@ -1858,18 +1894,14 @@ impl RemoteTimelineClient {
                    } else {
                        *mode = Some(OpType::MayReorder)
                    }
-                    upload_queue.num_inprogress_layer_uploads += 1;
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
+                UploadOp::UploadMetadata { .. } => {}
                UploadOp::Delete(Delete { layers }) => {
                    for (name, meta) in layers {
                        upload_queue
                            .recently_deleted
                            .insert((name.clone(), meta.generation));
                    }
-                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
                    sender.send_replace(());
@@ -1886,6 +1918,7 @@ impl RemoteTimelineClient {
            let task = Arc::new(UploadTask {
                task_id: upload_task_id,
                op: next_op,
+                coalesced_ops,
                retries: AtomicU32::new(0),
            });
            upload_queue
@@ -1969,6 +2002,8 @@ impl RemoteTimelineClient {

            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
+                    // TODO: check if this mechanism can be removed now that can_bypass() performs
+                    // conflict checks during scheduling.
                    if let Some(OpType::FlushDeletion) = mode {
                        if self.config.read().unwrap().block_deletions {
                            // Of course, this is not efficient... but usually the queue should be empty.
@@ -2191,13 +2226,8 @@ impl RemoteTimelineClient {
            upload_queue.inprogress_tasks.remove(&task.task_id);

            let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _, _) => {
-                    upload_queue.num_inprogress_layer_uploads -= 1;
-                    None
-                }
+                UploadOp::UploadLayer(_, _, _) => None,
                UploadOp::UploadMetadata { ref uploaded } => {
-                    upload_queue.num_inprogress_metadata_uploads -= 1;
-
                    // the task id is reused as a monotonicity check for storing the "clean"
                    // IndexPart.
                    let last_updater = upload_queue.clean.1;
@@ -2231,10 +2261,7 @@ impl RemoteTimelineClient {
                        None
                    }
                }
-                UploadOp::Delete(_) => {
-                    upload_queue.num_inprogress_deletions -= 1;
-                    None
-                }
+                UploadOp::Delete(_) => None,
                UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
            };

@@ -2259,6 +2286,9 @@ impl RemoteTimelineClient {
        }

        self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
    }

    fn metric_impl(
@@ -2351,6 +2381,7 @@ impl RemoteTimelineClient {
                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
+                        inprogress_limit: initialized.inprogress_limit,
                        task_counter: 0,
                        dirty: initialized.dirty.clone(),
                        clean: initialized.clean.clone(),
@@ -2358,9 +2389,6 @@ impl RemoteTimelineClient {
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
-                        num_inprogress_layer_uploads: 0,
-                        num_inprogress_metadata_uploads: 0,
-                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
                        #[cfg(feature = "testing")]
@@ -2387,14 +2415,6 @@ impl RemoteTimelineClient {
                    }
                };

-                // consistency check
-                assert_eq!(
-                    qi.num_inprogress_layer_uploads
-                        + qi.num_inprogress_metadata_uploads
-                        + qi.num_inprogress_deletions,
-                    qi.inprogress_tasks.len()
-                );
-
                // We don't need to do anything here for in-progress tasks. They will finish
                // on their own, decrement the unfinished-task counter themselves, and observe
                // that the queue is Stopped.
@@ -2841,8 +2861,8 @@ mod tests {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.is_empty());
-            assert!(upload_queue.inprogress_tasks.len() == 2);
-            assert!(upload_queue.num_inprogress_layer_uploads == 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 2);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);

            // also check that `latest_file_changes` was updated
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
@@ -2912,8 +2932,8 @@ mod tests {
            // Deletion schedules upload of the index file, and the file deletion itself
            assert_eq!(upload_queue.queued_operations.len(), 2);
            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
-            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
+            assert_eq!(upload_queue.num_inprogress_deletions(), 0);
            assert_eq!(
                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                0
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -104,7 +104,7 @@ impl IndexPart {

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
+    pub fn empty(metadata: TimelineMetadata) -> Self {
        IndexPart {
            version: Self::LATEST_VERSION,
            layer_metadata: Default::default(),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
-use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
            .keys
            .entry(*key)
            .or_insert(Ok(VectoredValueReconstructState::default()));
-        let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
+        let is_sparse_key = key.is_sparse();
        if let Ok(state) = state {
            let key_done = match state.situation {
                ValueReconstructSituation::Complete => {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
 ///
 /// Layout:
 /// - 1 bit: `will_init`
-/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
-/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
+/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
+/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct IndexEntry(u64);

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1812,7 +1812,7 @@ enum LayerKind {

 /// Guard for forcing a layer be resident while it exists.
 #[derive(Clone)]
-pub(crate) struct ResidentLayer {
+pub struct ResidentLayer {
    owner: Layer,
    downloaded: Arc<DownloadedLayer>,
 }
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -3,7 +3,7 @@ use std::{
        atomic::{AtomicU64, Ordering},
        Arc,
    },
-    time::{Duration, Instant},
+    time::Instant,
 };

 use arc_swap::ArcSwap;
@@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
 /// To share a throttle among multiple entities, wrap it in an [`Arc`].
 ///
 /// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
-pub struct Throttle<M: Metric> {
+pub struct Throttle {
    inner: ArcSwap<Inner>,
-    metric: M,
    /// will be turned into [`Stats::count_accounted_start`]
    count_accounted_start: AtomicU64,
    /// will be turned into [`Stats::count_accounted_finish`]
@@ -36,15 +35,6 @@ pub struct Inner {

 pub type Config = pageserver_api::models::ThrottleConfig;

-pub struct Observation {
-    pub wait_time: Duration,
-}
-pub trait Metric {
-    fn accounting_start(&self);
-    fn accounting_finish(&self);
-    fn observe_throttling(&self, observation: &Observation);
-}
-
 /// See [`Throttle::reset_stats`].
 pub struct Stats {
    /// Number of requests that started [`Throttle::throttle`] calls.
@@ -59,18 +49,14 @@ pub struct Stats {
 }

 pub enum ThrottleResult {
-    NotThrottled { start: Instant },
-    Throttled { start: Instant, end: Instant },
+    NotThrottled { end: Instant },
+    Throttled { end: Instant },
 }

-impl<M> Throttle<M>
-where
-    M: Metric,
-{
-    pub fn new(config: Config, metric: M) -> Self {
+impl Throttle {
+    pub fn new(config: Config) -> Self {
        Self {
            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
-            metric,
            count_accounted_start: AtomicU64::new(0),
            count_accounted_finish: AtomicU64::new(0),
            count_throttled: AtomicU64::new(0),
@@ -127,32 +113,27 @@ where
        self.inner.load().rate_limiter.steady_rps()
    }

-    pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
+    /// `start` must be [`Instant::now`] or earlier.
+    pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

-        let start = std::time::Instant::now();
-
        if !inner.enabled {
-            return ThrottleResult::NotThrottled { start };
+            return ThrottleResult::NotThrottled { end: start };
        }

-        self.metric.accounting_start();
        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
-        self.metric.accounting_finish();

        if did_throttle {
            self.count_throttled.fetch_add(1, Ordering::Relaxed);
-            let now = Instant::now();
-            let wait_time = now - start;
+            let end = Instant::now();
+            let wait_time = end - start;
            self.sum_throttled_usecs
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
-            let observation = Observation { wait_time };
-            self.metric.observe_throttling(&observation);
-            ThrottleResult::Throttled { start, end: now }
+            ThrottleResult::Throttled { end }
        } else {
-            ThrottleResult::NotThrottled { start }
+            ThrottleResult::NotThrottled { end: start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -27,7 +27,7 @@ use pageserver_api::{
    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
    key::{
        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: RemoteTimelineClient,
-    pub pagestream_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
+    pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
+    pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

@@ -412,8 +412,7 @@ pub struct Timeline {
    gc_lock: tokio::sync::Mutex<()>,

    /// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
-    pub(crate) pagestream_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
+    pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,

    /// Size estimator for aux file v2
    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
@@ -2310,6 +2309,7 @@ impl Timeline {
                query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
                    &tenant_shard_id,
                    &timeline_id,
+                    resources.pagestream_throttle_metrics,
                ),

                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
@@ -3221,7 +3221,7 @@ impl Timeline {
            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
            // stalling compaction.
            keyspace.remove_overlapping_with(&KeySpace {
-                ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
+                ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
            });

            // Keyspace is fully retrieved
@@ -3242,7 +3242,11 @@ impl Timeline {
            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
            // space. If that's not the case, we had at least one key encounter a gap in the image layer
            // and stop the search as a result of that.
-            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            // Do not fire missing key error for sparse keys.
+            removed.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
            if !removed.is_empty() {
                break Some(removed);
            }
@@ -3257,6 +3261,21 @@ impl Timeline {
            timeline = &*timeline_owned;
        };

+        // Remove sparse keys from the keyspace so that it doesn't fire errors.
+        let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
+            let mut missing_keyspace = missing_keyspace;
+            missing_keyspace.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
+            if missing_keyspace.is_empty() {
+                None
+            } else {
+                Some(missing_keyspace)
+            }
+        } else {
+            None
+        };
+
        if let Some(missing_keyspace) = missing_keyspace {
            return Err(GetVectoredError::MissingKey(MissingKeyError {
                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
@@ -3762,36 +3781,35 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

-            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
-                    &rel_partition,
-                    self.initdb_lsn,
-                    ImageLayerCreationMode::Initial,
-                    ctx,
-                )
-                .await?,
-            );
+            // Make a single call to `create_image_layers` with a combined dense keyspace, so
+            // that the key ranges don't overlap.
+            let mut partitions = KeyPartitioning::default();
+            partitions.parts.extend(rel_partition.parts);
            if !metadata_partition.parts.is_empty() {
                assert_eq!(
                    metadata_partition.parts.len(),
                    1,
                    "currently sparse keyspace should only contain a single metadata keyspace"
                );
-                layers_to_upload.extend(
-                    self.create_image_layers(
-                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
-                        // every single key within the keyspace, and therefore, it's safe to force converting it
-                        // into a dense keyspace before calling this function.
-                        &metadata_partition.into_dense(),
-                        self.initdb_lsn,
-                        ImageLayerCreationMode::Initial,
-                        ctx,
-                    )
-                    .await?,
-                );
+                // Safety: create_image_layers treats sparse keyspaces differently in that it does not
+                // scan every single key within the keyspace, and therefore it's safe to force-convert
+                // it into a dense keyspace before calling this function.
+                partitions
+                    .parts
+                    .extend(metadata_partition.into_dense().parts);
            }

+            let mut layers_to_upload = Vec::new();
+            layers_to_upload.extend(
+                self.create_image_layers(
+                    &partitions,
+                    self.initdb_lsn,
+                    ImageLayerCreationMode::Initial,
+                    ctx,
+                )
+                .await?,
+            );
+
            (layers_to_upload, None)
        } else {
            // Normal case, write out a L0 delta layer file.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;

@@ -16,10 +16,12 @@ use super::{

 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
+use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
@@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
 use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
+use crate::tenant::gc_block::GcBlock;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -63,16 +66,284 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

-/// A scheduled compaction task.
-pub(crate) struct ScheduledCompactionTask {
-    /// It's unfortunate that we need to store a compact options struct here because the only outer
-    /// API we can call here is `compact_with_options` which does a few setup calls before starting the
-    /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
-    pub options: CompactOptions,
-    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
-    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
-    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
-    pub gc_block: Option<gc_block::Guard>,
+/// Identifier of an entry in the gc-compaction queue.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub struct GcCompactionJobId(pub usize);
+
+impl std::fmt::Display for GcCompactionJobId {
+    /// Writes the wrapped number and nothing else.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self(raw) = self;
+        write!(f, "{}", raw)
+    }
+}
+
+/// A unit of work stored in the gc-compaction queue.
+#[derive(Debug, Clone)]
+pub enum GcCompactionQueueItem {
+    /// A compaction request added through `schedule_manual_compaction`.
+    Manual(CompactOptions),
+    /// One of the jobs produced when a `Manual` request with `sub_compaction` set is split.
+    SubCompactionJob(CompactOptions),
+    /// Not processed yet: `iteration` treats this variant as unreachable.
+    #[allow(dead_code)]
+    UpdateL2Lsn(Lsn),
+    /// Marker queued behind a batch of sub-compaction jobs. Processing it signals the waiter and
+    /// releases the GC guard registered under the wrapped job id.
+    Notify(GcCompactionJobId),
+}
+
+impl GcCompactionQueueItem {
+    /// Converts this queue entry into the API-facing [`CompactInfoResponse`].
+    ///
+    /// `id` and `running` are copied into the response unchanged. Only entries that carry
+    /// [`CompactOptions`] have a representation; the remaining variants yield `None`.
+    pub fn into_compact_info_resp(
+        self,
+        id: GcCompactionJobId,
+        running: bool,
+    ) -> Option<CompactInfoResponse> {
+        let options = match self {
+            Self::Manual(options) | Self::SubCompactionJob(options) => options,
+            Self::UpdateL2Lsn(_) | Self::Notify(_) => return None,
+        };
+        Some(CompactInfoResponse {
+            compact_key_range: options.compact_key_range,
+            compact_lsn_range: options.compact_lsn_range,
+            sub_compaction: options.sub_compaction,
+            running,
+            job_id: id.0,
+        })
+    }
+}
+
+/// Mutable state of the gc-compaction queue, kept behind the mutex of `GcCompactionQueue`.
+struct GcCompactionQueueInner {
+    /// The item currently being processed by `iteration`, if any.
+    running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+    /// Items waiting to be processed; `iteration` consumes them from the front.
+    queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    /// Completion channels keyed by job id; signalled and removed in `notify_and_unblock`.
+    notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
+    /// GC-block guards keyed by job id; removed and dropped in `notify_and_unblock`.
+    gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
+    /// The id that the next call to `next_id` hands out (despite the name).
+    last_id: GcCompactionJobId,
+}
+
+impl GcCompactionQueueInner {
+    /// Hands out the current id and advances the counter by one.
+    fn next_id(&mut self) -> GcCompactionJobId {
+        let GcCompactionJobId(current) = self.last_id;
+        std::mem::replace(&mut self.last_id, GcCompactionJobId(current + 1))
+    }
+}
+
+/// A structure to store gc_compaction jobs.
+///
+/// Jobs are added with `schedule_manual_compaction` and consumed one at a time by `iteration`.
+pub struct GcCompactionQueue {
+    /// All items in the queue, and the currently-running job.
+    inner: std::sync::Mutex<GcCompactionQueueInner>,
+    /// Ensure only one thread is consuming the queue; held for the whole of `iteration`.
+    consumer_lock: tokio::sync::Mutex<()>,
+}
+
+impl GcCompactionQueue {
+    /// Creates an empty queue: nothing running, nothing queued, ids starting at 0.
+    pub fn new() -> Self {
+        let inner = GcCompactionQueueInner {
+            running: None,
+            queued: VecDeque::new(),
+            notify: HashMap::new(),
+            gc_guards: HashMap::new(),
+            last_id: GcCompactionJobId(0),
+        };
+        Self {
+            inner: std::sync::Mutex::new(inner),
+            consumer_lock: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    /// Drops every queued item together with all registered completion channels and GC guards.
+    ///
+    /// The `running` slot is left untouched.
+    // NOTE(review): `notify` and `gc_guards` are cleared wholesale, which presumably also drops
+    // the entries of a job that is currently running — confirm this is intended.
+    pub fn cancel_scheduled(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.queued.clear();
+        guard.notify.clear();
+        guard.gc_guards.clear();
+    }
+
+    /// Appends a manual compaction request to the back of the queue and returns its job id.
+    ///
+    /// When `notify` is given, it is registered under the new id so that the caller can be
+    /// signalled once the job has been processed.
+    pub fn schedule_manual_compaction(
+        &self,
+        options: CompactOptions,
+        notify: Option<tokio::sync::oneshot::Sender<()>>,
+    ) -> GcCompactionJobId {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner.next_id();
+        let item = GcCompactionQueueItem::Manual(options);
+        inner.queued.push_back((id, item));
+        if let Some(tx) = notify {
+            inner.notify.insert(id, tx);
+        }
+        info!("scheduled compaction job id={}", id);
+        id
+    }
+
+    /// Trigger an auto compaction.
+    ///
+    /// Currently a placeholder: the body is empty and the timeline argument is ignored.
+    #[allow(dead_code)]
+    pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
+
+    /// Marks job `id` as finished: unblocks GC by dropping the guard stored for the job (if any)
+    /// and signals the completion channel registered for it (if any).
+    fn notify_and_unblock(&self, id: GcCompactionJobId) {
+        info!("compaction job id={} finished", id);
+        let mut inner = self.inner.lock().unwrap();
+        drop(inner.gc_guards.remove(&id));
+        if let Some(tx) = inner.notify.remove(&id) {
+            // A failed send is deliberately ignored.
+            let _ = tx.send(());
+        }
+    }
+
+    /// Splits a manual request that has `sub_compaction` set into smaller jobs and places them
+    /// at the front of the queue, followed by a `Notify` entry for the parent job `id`.
+    ///
+    /// If the split yields no jobs, the parent job is completed right away. Otherwise a GC guard
+    /// is acquired and stored under `id`; it is released when the `Notify` entry is processed.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`CompactionError::Other`] if splitting fails or the GC guard cannot be acquired.
+    async fn handle_sub_compaction(
+        &self,
+        id: GcCompactionJobId,
+        options: CompactOptions,
+        timeline: &Arc<Timeline>,
+        gc_block: &GcBlock,
+    ) -> Result<(), CompactionError> {
+        info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+        let split = timeline
+            .gc_compaction_split_jobs(
+                GcCompactJob::from_compact_options(options.clone()),
+                options.sub_compaction_max_job_size_mb,
+            )
+            .await;
+        let jobs: Vec<GcCompactJob> = split.map_err(CompactionError::Other)?;
+        if jobs.is_empty() {
+            info!("no jobs to run, skipping scheduled compaction task");
+            self.notify_and_unblock(id);
+            return Ok(());
+        }
+
+        let gc_guard = gc_block.start().await.map_err(|e| {
+            CompactionError::Other(anyhow!(
+                "cannot run gc-compaction because gc is blocked: {}",
+                e
+            ))
+        })?;
+
+        let jobs_len = jobs.len();
+        // Each `GcCompactJob` has to be converted back into `CompactOptions` until further
+        // refactoring allows calling `compact_with_gc` directly.
+        let sub_jobs = jobs.into_iter().map(|job| {
+            let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+            flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+            if job.dry_run {
+                flags |= CompactFlags::DryRun;
+            }
+            GcCompactionQueueItem::SubCompactionJob(CompactOptions {
+                flags,
+                sub_compaction: false,
+                compact_key_range: Some(job.compact_key_range.into()),
+                compact_lsn_range: Some(job.compact_lsn_range.into()),
+                sub_compaction_max_job_size_mb: None,
+            })
+        });
+        // The trailing `Notify` entry completes the parent job after all sub-jobs.
+        let pending_tasks: Vec<_> = sub_jobs
+            .chain(std::iter::once(GcCompactionQueueItem::Notify(id)))
+            .collect();
+        {
+            let mut inner = self.inner.lock().unwrap();
+            inner.gc_guards.insert(id, gc_guard);
+            // Hand out ids in execution order first, then push onto the front back-to-front so
+            // that the first sub-job ends up at the head of the queue.
+            let numbered: Vec<_> = pending_tasks
+                .into_iter()
+                .map(|task| (inner.next_id(), task))
+                .collect();
+            for entry in numbered.into_iter().rev() {
+                inner.queued.push_front(entry);
+            }
+        }
+        info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
+        Ok(())
+    }
+
+    /// Take a job from the queue and process it. Returns whether there are still pending tasks.
+    ///
+    /// Only one call runs at a time (serialized by `consumer_lock`). The popped item is recorded
+    /// as `running` while it is processed, and the slot is cleared again before returning, on
+    /// success as well as on failure.
+    ///
+    /// The returned flag reflects the queue *after* the item was processed, so jobs that a
+    /// sub-compaction split has just queued are reported as pending.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error of the processed job. If a `Manual` job fails, its GC guard is
+    /// released and its waiter (if any) is woken up, so neither stays registered indefinitely.
+    /// The completion channel carries no payload, so the waiter cannot tell failure from success
+    /// through it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the inner mutex is poisoned, or if an `UpdateL2Lsn` item is dequeued.
+    pub async fn iteration(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+        gc_block: &GcBlock,
+        timeline: &Arc<Timeline>,
+    ) -> Result<bool, CompactionError> {
+        let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
+        let (id, item) = {
+            let mut guard = self.inner.lock().unwrap();
+            let Some((id, item)) = guard.queued.pop_front() else {
+                return Ok(false);
+            };
+            guard.running = Some((id, item.clone()));
+            (id, item)
+        };
+        let is_manual = matches!(item, GcCompactionQueueItem::Manual(_));
+
+        // Run the job inside its own future so that every early exit (`?`) still passes through
+        // the cleanup below instead of returning from the function directly.
+        let result = async {
+            match item {
+                GcCompactionQueueItem::Manual(options) => {
+                    if !options
+                        .flags
+                        .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                    {
+                        // NOTE(review): a waiter registered for an ignored task is not woken up
+                        // here — confirm whether callers can reach this with a notifier.
+                        warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
+                    } else if options.sub_compaction {
+                        self.handle_sub_compaction(id, options, timeline, gc_block)
+                            .await?;
+                    } else {
+                        let gc_guard = match gc_block.start().await {
+                            Ok(guard) => guard,
+                            Err(e) => {
+                                return Err(CompactionError::Other(anyhow!(
+                                    "cannot run gc-compaction because gc is blocked: {}",
+                                    e
+                                )));
+                            }
+                        };
+                        {
+                            let mut guard = self.inner.lock().unwrap();
+                            guard.gc_guards.insert(id, gc_guard);
+                        }
+                        let _ = timeline
+                            .compact_with_options(cancel, options, ctx)
+                            .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                            .await?;
+                        self.notify_and_unblock(id);
+                    }
+                }
+                GcCompactionQueueItem::SubCompactionJob(options) => {
+                    let _ = timeline
+                        .compact_with_options(cancel, options, ctx)
+                        .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                        .await?;
+                }
+                GcCompactionQueueItem::Notify(parent_id) => {
+                    self.notify_and_unblock(parent_id);
+                }
+                GcCompactionQueueItem::UpdateL2Lsn(_) => {
+                    unreachable!()
+                }
+            }
+            Ok::<(), CompactionError>(())
+        }
+        .await;
+
+        if result.is_err() && is_manual {
+            // A failed manual job never reaches its regular completion path. Release its GC guard
+            // and wake its waiter here; otherwise GC stays blocked and the waiter hangs.
+            // A failed sub-job needs nothing: its parent's `Notify` entry is still queued.
+            self.notify_and_unblock(id);
+        }
+        let has_pending_tasks = {
+            let mut guard = self.inner.lock().unwrap();
+            guard.running = None;
+            // Sampled after processing so that freshly split sub-jobs are counted.
+            !guard.queued.is_empty()
+        };
+        result?;
+        Ok(has_pending_tasks)
+    }
+
+    /// Returns clones of the currently running item (if any) and of all queued items.
+    #[allow(clippy::type_complexity)]
+    pub fn remaining_jobs(
+        &self,
+    ) -> (
+        Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+        VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    ) {
+        let inner = self.inner.lock().unwrap();
+        let running = inner.running.clone();
+        let queued = inner.queued.clone();
+        (running, queued)
+    }
+
+    /// Number of unfinished jobs: all queued items plus the running one, if any.
+    #[allow(dead_code)]
+    pub fn remaining_jobs_num(&self) -> usize {
+        let inner = self.inner.lock().unwrap();
+        inner.queued.len() + usize::from(inner.running.is_some())
+    }
 }

 /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -194,7 +194,9 @@ impl DeleteTimelineFlow {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

        let allow_offloaded_children = false;
-        let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
+        let set_stopping = true;
+        let (timeline, mut guard) =
+            Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;

        guard.mark_in_progress()?;

@@ -299,6 +301,7 @@ impl DeleteTimelineFlow {
                TimelineResources {
                    remote_client,
                    pagestream_throttle: tenant.pagestream_throttle.clone(),
+                    pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
                    l0_flush_global_state: tenant.l0_flush_global_state.clone(),
                },
                // Important. We dont pass ancestor above because it can be missing.
@@ -334,6 +337,7 @@ impl DeleteTimelineFlow {
        tenant: &Tenant,
        timeline_id: TimelineId,
        allow_offloaded_children: bool,
+        set_stopping: bool,
    ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
        // Note the interaction between this guard and deletion guard.
        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
@@ -389,8 +393,10 @@ impl DeleteTimelineFlow {
            }
        };

-        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
-            timeline.set_state(TimelineState::Stopping);
+        if set_stopping {
+            if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
+                timeline.set_state(TimelineState::Stopping);
+            }
        }

        Ok((timeline, delete_lock_guard))
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -1,10 +1,11 @@
 use std::sync::Arc;

-use pageserver_api::models::TenantState;
+use pageserver_api::models::{TenantState, TimelineState};

 use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
 use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};

 #[derive(thiserror::Error, Debug)]
@@ -36,28 +37,29 @@ pub(crate) async fn offload_timeline(
    tracing::info!("offloading archived timeline");

    let allow_offloaded_children = true;
-    let (timeline, guard) =
-        DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
-            .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
+    let set_stopping = false;
+    let (timeline, guard) = DeleteTimelineFlow::prepare(
+        tenant,
+        timeline.timeline_id,
+        allow_offloaded_children,
+        set_stopping,
+    )
+    .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
        tracing::error!("timeline already offloaded, but given timeline object");
        return Ok(());
    };

-    let is_archived = timeline.is_archived();
-    match is_archived {
-        Some(true) => (),
-        Some(false) => {
-            tracing::warn!("tried offloading a non-archived timeline");
-            return Err(OffloadError::NotArchived);
-        }
-        None => {
-            // This is legal: calls to this function can race with the timeline shutting down
-            tracing::info!("tried offloading a timeline whose remote storage is not initialized");
-            return Err(OffloadError::Cancelled);
+    match timeline.remote_client.shutdown_if_archived().await {
+        Ok(()) => {}
+        Err(ShutdownIfArchivedError::NotInitialized(_)) => {
+            // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
+            // Don't return cancelled here to keep it idempotent.
        }
+        Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
    }
+    timeline.set_state(TimelineState::Stopping);

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
    timeline.shutdown(super::ShutdownMode::Reload).await;
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection(

    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;

+    let shard = vec![*timeline.get_shard_identity()];
+
    let interpreted_proto_config = match protocol {
        PostgresClientProtocol::Vanilla => None,
        PostgresClientProtocol::Interpreted {
@@ -403,7 +405,7 @@ pub(super) async fn handle_walreceiver_connection(
                // need to advance last record LSN on all shards. If we've not ingested the latest
                // record, then set the LSN of the modification past it. This way all shards
                // advance their last record LSN at the same time.
-                let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
+                let needs_last_record_lsn_advance = match next_record_lsn {
                    Some(lsn) if lsn > modification.get_lsn() => {
                        modification.set_lsn(lsn).unwrap();
                        true
@@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection(
                        // Deserialize and interpret WAL record
                        let interpreted = InterpretedWalRecord::from_bytes_filtered(
                            recdata,
-                            modification.tline.get_shard_identity(),
+                            &shard,
                            next_record_lsn,
                            modification.tline.pg_version,
-                        )?;
+                        )?
+                        .remove(timeline.get_shard_identity())
+                        .unwrap();

                        if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                            && uncommitted_records > 0
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
--- a/Show More
+++ b/Show More