fixup: doc reference to renamed field

prove hypothesis (inefficient fix)
tests: add unit test for vec read with overlapped images
2026-05-21 15:10:44 +00:00 · 2024-09-16 19:44:09 +01:00 · 2024-09-16 17:17:49 +01:00 · 2024-09-16 17:17:26 +01:00
142 changed files with 2059 additions and 2652 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -13,7 +13,6 @@
 # Directories
 !.cargo/
 !.config/
-!compute/
 !compute_tools/
 !control_plane/
 !libs/
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -52,5 +52,5 @@ jobs:
          for image in ${images}; do
            docker buildx imagetools create \
              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                                        neondatabase/${image}:${{ inputs.image_tag }}
+                                        neondatabase/${image}:${{ inputs.image_tag }}
          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -54,8 +54,8 @@ jobs:
      build-tag: ${{steps.build-tag.outputs.tag}}

    steps:
-      # Need `fetch-depth: 0` to count the number of commits in the branch
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -159,10 +159,6 @@ jobs:
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
-      #
-      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
-      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
-      # time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -172,6 +168,8 @@ jobs:
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
@@ -359,7 +357,6 @@ jobs:
            })

  coverage-report:
-    if: ${{ !startsWith(github.ref_name, 'release') }}
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
    runs-on: [ self-hosted, small ]
    container:
@@ -376,8 +373,8 @@ jobs:
        coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
-      # Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0
@@ -478,9 +475,11 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
@@ -555,9 +554,11 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
@@ -704,7 +705,10 @@ jobs:
      VM_BUILDER_VERSION: v0.29.3

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Downloading vm-builder
        run: |
@@ -744,7 +748,10 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
@@ -950,7 +957,6 @@ jobs:

  deploy:
    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()

    runs-on: [ self-hosted, small ]
@@ -970,7 +976,10 @@ jobs:
            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
          done

-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Trigger deploy workflow
        env:
@@ -1049,8 +1058,7 @@ jobs:
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
-    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
-    if: github.ref_name == 'release' && !failure() && !cancelled()
+    if: github.ref_name == 'release'

    runs-on: ubuntu-22.04
    steps:
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -34,8 +34,8 @@ jobs:
      build-tag: ${{ steps.build-tag.outputs.tag }}

    steps:
-      # Need `fetch-depth: 0` to count the number of commits in the branch
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1209,6 +1209,7 @@ dependencies = [
 "remote_storage",
 "serde",
 "serde_json",
+ "serde_with",
 "utils",
 ]

@@ -1217,6 +1218,7 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "bytes",
 "cfg-if",
 "chrono",
@@ -1235,6 +1237,7 @@ dependencies = [
 "reqwest 0.12.4",
 "rlimit",
 "rust-ini",
+ "serde",
 "serde_json",
 "signal-hook",
 "tar",
@@ -1243,6 +1246,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -1313,9 +1317,12 @@ dependencies = [
 name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
 "chrono",
 "rand 0.8.5",
 "serde",
+ "serde_with",
+ "utils",
 ]

 [[package]]
@@ -1327,7 +1334,9 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "futures",
 "git-version",
+ "hex",
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
@@ -1335,6 +1344,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "pageserver_client",
+ "postgres",
 "postgres_backend",
 "postgres_connection",
 "regex",
@@ -1343,7 +1353,9 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "serde_with",
 "storage_broker",
+ "tar",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -1651,6 +1663,7 @@ dependencies = [
 "hex",
 "parking_lot 0.12.1",
 "rand 0.8.5",
+ "scopeguard",
 "smallvec",
 "tracing",
 "utils",
@@ -2220,22 +2233,24 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"

 [[package]]
 name = "git-version"
-version = "0.3.9"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
+checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
 dependencies = [
 "git-version-macro",
+ "proc-macro-hack",
 ]

 [[package]]
 name = "git-version-macro"
-version = "0.3.9"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
+checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
 dependencies = [
+ "proc-macro-hack",
 "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -2729,6 +2744,19 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "inotify"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
+dependencies = [
+ "bitflags 1.3.2",
+ "futures-core",
+ "inotify-sys",
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -3223,7 +3251,7 @@ dependencies = [
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
- "inotify",
+ "inotify 0.9.6",
 "kqueue",
 "libc",
 "log",
@@ -3614,6 +3642,7 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "bytes",
 "camino",
 "clap",
 "git-version",
@@ -3622,6 +3651,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "remote_storage",
+ "serde",
 "serde_json",
 "svg_fmt",
 "thiserror",
@@ -3640,6 +3670,7 @@ dependencies = [
 "arc-swap",
 "async-compression",
 "async-stream",
+ "async-trait",
 "bit_field",
 "byteorder",
 "bytes",
@@ -3647,13 +3678,16 @@ dependencies = [
 "camino-tempfile",
 "chrono",
 "clap",
+ "const_format",
 "consumption_metrics",
 "crc32c",
 "criterion",
+ "crossbeam-utils",
 "either",
 "enum-map",
 "enumset",
 "fail",
+ "flate2",
 "futures",
 "git-version",
 "hex",
@@ -3692,9 +3726,13 @@ dependencies = [
 "serde_json",
 "serde_path_to_error",
 "serde_with",
+ "signal-hook",
+ "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
+ "svg_fmt",
+ "sync_wrapper",
 "sysinfo",
 "tenant_size_model",
 "thiserror",
@@ -3708,6 +3746,7 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tracing",
+ "twox-hash",
 "url",
 "utils",
 "walkdir",
@@ -3771,22 +3810,44 @@ name = "pageserver_compaction"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "async-stream",
+ "byteorder",
+ "bytes",
+ "chrono",
 "clap",
+ "const_format",
+ "consumption_metrics",
 "criterion",
+ "crossbeam-utils",
+ "either",
+ "fail",
+ "flate2",
 "futures",
 "git-version",
+ "hex",
 "hex-literal",
+ "humantime",
+ "humantime-serde",
 "itertools 0.10.5",
+ "metrics",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
 "rand 0.8.5",
+ "smallvec",
 "svg_fmt",
+ "sync_wrapper",
+ "thiserror",
 "tokio",
+ "tokio-io-timeout",
+ "tokio-util",
 "tracing",
+ "tracing-error",
 "tracing-subscriber",
+ "url",
 "utils",
+ "walkdir",
 "workspace_hack",
 ]

@@ -4103,7 +4164,9 @@ name = "postgres_backend"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "bytes",
+ "futures",
 "once_cell",
 "pq_proto",
 "rustls 0.22.4",
@@ -4136,13 +4199,16 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bindgen",
+ "byteorder",
 "bytes",
 "crc32c",
 "env_logger",
+ "hex",
 "log",
 "memoffset 0.8.0",
 "once_cell",
 "postgres",
+ "rand 0.8.5",
 "regex",
 "serde",
 "thiserror",
@@ -4177,11 +4243,13 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
+ "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
 "serde",
 "thiserror",
 "tokio",
+ "tracing",
 ]

 [[package]]
@@ -4213,6 +4281,12 @@ dependencies = [
 "elliptic-curve 0.13.8",
 ]

+[[package]]
+name = "proc-macro-hack"
+version = "0.5.20+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -4331,6 +4405,7 @@ dependencies = [
 "aws-config",
 "aws-sdk-iam",
 "aws-sigv4",
+ "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4339,6 +4414,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "crossbeam-deque",
 "dashmap",
 "ecdsa 0.16.9",
 "env_logger",
@@ -4364,9 +4440,11 @@ dependencies = [
 "jose-jwa",
 "jose-jwk",
 "lasso",
+ "md5",
 "measured",
 "metrics",
 "once_cell",
+ "opentelemetry",
 "p256 0.13.2",
 "parking_lot 0.12.1",
 "parquet",
@@ -4387,6 +4465,7 @@ dependencies = [
 "reqwest-middleware",
 "reqwest-retry",
 "reqwest-tracing",
+ "routerify",
 "rsa",
 "rstest",
 "rustc-hash",
@@ -4402,6 +4481,7 @@ dependencies = [
 "smol_str",
 "socket2 0.5.5",
 "subtle",
+ "task-local-extensions",
 "thiserror",
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
@@ -4411,6 +4491,7 @@ dependencies = [
 "tokio-rustls 0.25.0",
 "tokio-tungstenite",
 "tokio-util",
+ "tower-service",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -4700,6 +4781,7 @@ dependencies = [
 "async-stream",
 "async-trait",
 "aws-config",
+ "aws-credential-types",
 "aws-sdk-s3",
 "aws-smithy-async",
 "aws-smithy-types",
@@ -4713,6 +4795,7 @@ dependencies = [
 "futures",
 "futures-util",
 "http-types",
+ "humantime",
 "humantime-serde",
 "hyper 0.14.26",
 "itertools 0.10.5",
@@ -5192,12 +5275,14 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
+ "async-trait",
 "byteorder",
 "bytes",
 "camino",
 "camino-tempfile",
 "chrono",
 "clap",
+ "const_format",
 "crc32c",
 "desim",
 "fail",
@@ -5223,7 +5308,9 @@ dependencies = [
 "sd-notify",
 "serde",
 "serde_json",
+ "serde_with",
 "sha2",
+ "signal-hook",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -5234,6 +5321,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
+ "toml_edit",
 "tracing",
 "tracing-subscriber",
 "url",
@@ -5248,6 +5336,7 @@ version = "0.1.0"
 dependencies = [
 "const_format",
 "serde",
+ "serde_with",
 "utils",
 ]

@@ -5776,6 +5865,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
+ "bytes",
 "clap",
 "const_format",
 "futures",
@@ -5789,6 +5879,7 @@ dependencies = [
 "parking_lot 0.12.1",
 "prost",
 "tokio",
+ "tokio-stream",
 "tonic",
 "tonic-build",
 "tracing",
@@ -5801,7 +5892,9 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "aws-config",
 "bytes",
+ "camino",
 "chrono",
 "clap",
 "control_plane",
@@ -5842,9 +5935,20 @@ dependencies = [
 name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "bytes",
+ "futures",
+ "pageserver_api",
 "pageserver_client",
+ "postgres",
 "reqwest 0.12.4",
 "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "utils",
 "workspace_hack",
 ]

@@ -5856,9 +5960,13 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
 "camino",
 "chrono",
 "clap",
+ "crc32c",
 "either",
 "futures",
 "futures-util",
@@ -5870,16 +5978,20 @@ dependencies = [
 "pageserver",
 "pageserver_api",
 "postgres_ffi",
+ "rand 0.8.5",
 "remote_storage",
 "reqwest 0.12.4",
 "rustls 0.22.4",
 "rustls-native-certs 0.7.0",
 "serde",
 "serde_json",
+ "serde_with",
 "storage_controller_client",
+ "thiserror",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
+ "tokio-rustls 0.25.0",
 "tokio-stream",
 "tokio-util",
 "tracing",
@@ -5898,11 +6010,14 @@ dependencies = [
 "comfy-table",
 "futures",
 "humantime",
+ "hyper 0.14.26",
 "pageserver_api",
 "pageserver_client",
 "reqwest 0.12.4",
+ "serde",
 "serde_json",
 "storage_controller_client",
+ "thiserror",
 "tokio",
 "tracing",
 "utils",
@@ -6025,6 +6140,15 @@ dependencies = [
 "xattr",
 ]

+[[package]]
+name = "task-local-extensions"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
+dependencies = [
+ "pin-utils",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.9.0"
@@ -6615,6 +6739,7 @@ dependencies = [
 "opentelemetry",
 "opentelemetry-otlp",
 "opentelemetry-semantic-conventions",
+ "reqwest 0.12.4",
 "tokio",
 "tracing",
 "tracing-opentelemetry",
@@ -6818,6 +6943,7 @@ dependencies = [
 "serde_assert",
 "serde_json",
 "serde_path_to_error",
+ "serde_with",
 "signal-hook",
 "strum",
 "strum_macros",
@@ -6873,11 +6999,13 @@ dependencies = [
 "cgroups-rs",
 "clap",
 "futures",
+ "inotify 0.10.2",
 "serde",
 "serde_json",
 "sysinfo",
 "tokio",
 "tokio-postgres",
+ "tokio-stream",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -6904,6 +7032,7 @@ dependencies = [
 "clap",
 "env_logger",
 "log",
+ "once_cell",
 "postgres",
 "postgres_ffi",
 "regex",
@@ -7426,7 +7555,6 @@ dependencies = [
 "digest",
 "either",
 "fail",
- "futures",
 "futures-channel",
 "futures-executor",
 "futures-io",
@@ -7482,8 +7610,6 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
- "tracing-log",
- "tracing-subscriber",
 "url",
 "uuid",
 "zeroize",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -55,27 +55,22 @@ RUN cd postgres && \
    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
    # so we do it here.
+    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
+    # the first loop is for pg_stat_statements extension versions <= 1.6
    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
        filename=$(basename "$file"); \
-        # Note that there are no downgrade scripts for pg_stat_statements, so we \
-        # don't have to modify any downgrade paths or (much) older versions: we only \
-        # have to make sure every creation of the pg_stat_statements_reset function \
-        # also adds execute permissions to the neon_superuser.
-        case $filename in \
-          pg_stat_statements--1.4.sql) \
-            # pg_stat_statements_reset is first created with 1.4
+        if echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.6--1.7.sql) \
-            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
+        fi; \
+    done; \
+    # the second loop is for pg_stat_statements extension versions >= 1.7,
+    # where pg_stat_statements_reset() got 3 additional arguments
+    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
+        filename=$(basename "$file"); \
+        if ! echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.10--1.11.sql) \
-            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
-            ;; \
-        esac; \
-    done;
+        fi; \
+    done

 #########################################################################################
 #
@@ -1031,15 +1026,6 @@ FROM debian:bullseye-slim AS compute-tools-image

 COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

-#########################################################################################
-#
-# Layers postgres_exporter and sql_exporter
-#
-#########################################################################################
-
-FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
-FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
-
 #########################################################################################
 #
 # Clean up postgres folder before inclusion
@@ -1169,19 +1155,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

-# Metrics exporter binaries and  configuration files
-COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
-COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
-
-COPY --chmod=0644 compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
-COPY --chmod=0644 compute/etc/neon_collector.yml             /etc/neon_collector.yml
-COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
-COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
-
 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions

-
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/compute/README.md
+++ b/compute/README.md
@@ -1,5 +0,0 @@
-This directory contains configuration files that are included in the
-compute images. TODO: move Dockerilfes, patches, and everything else
-that's needed to build the compute image here.
-
-
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -1,247 +0,0 @@
-collector_name: neon_collector
-metrics:
- metric_name: lfc_misses
-  type: gauge
-  help: 'lfc_misses'
-  key_labels:
-  values: [lfc_misses]
-  query: |
-    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
- metric_name: lfc_used
-  type: gauge
-  help: 'LFC chunks used (chunk = 1MB)'
-  key_labels:
-  values: [lfc_used]
-  query: |
-    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
- metric_name: lfc_hits
-  type: gauge
-  help: 'lfc_hits'
-  key_labels:
-  values: [lfc_hits]
-  query: |
-    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
- metric_name: lfc_writes
-  type: gauge
-  help: 'lfc_writes'
-  key_labels:
-  values: [lfc_writes]
-  query: |
-    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
- metric_name: lfc_cache_size_limit
-  type: gauge
-  help: 'LFC cache size limit in bytes'
-  key_labels:
-  values: [lfc_cache_size_limit]
-  query: |
-    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
- metric_name: connection_counts
-  type: gauge
-  help: 'Connection counts'
-  key_labels:
-    - datname
-    - state
-  values: [count]
-  query: |
-    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
-
- metric_name: pg_stats_userdb
-  type: gauge
-  help: 'Stats for several oldest non-system dbs'
-  key_labels:
-    - datname
-  value_label: kind
-  values:
-    - db_size
-    - deadlocks
-    # Rows
-    - inserted
-    - updated
-    - deleted
-  # We export stats for 10 non-system database. Without this limit
-  # it is too easy to abuse the system by creating lots of databases.
-  query: |
-    select pg_database_size(datname) as db_size, deadlocks,
-       tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
-       datname
-     from pg_stat_database
-     where datname IN (
-       select datname
-       from pg_database
-       where datname <> 'postgres' and not datistemplate
-       order by oid
-       limit 10
-     );
-
- metric_name: max_cluster_size
-  type: gauge
-  help: 'neon.max_cluster_size setting'
-  key_labels:
-  values: [max_cluster_size]
-  query: |
-    select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
-
- metric_name: db_total_size
-  type: gauge
-  help: 'Size of all databases'
-  key_labels:
-  values: [total]
-  query: |
-    select sum(pg_database_size(datname)) as total from pg_database;
-
-# DEPRECATED
- metric_name: lfc_approximate_working_set_size
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels:
-  values: [approximate_working_set_size]
-  query: |
-    select neon.approximate_working_set_size(false) as approximate_working_set_size;
-
- metric_name: lfc_approximate_working_set_size_windows
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels: [duration]
-  values: [size]
-  # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
-  # of durations in a pretty-printed form.
-  query: |
-    select
-      x as duration,
-      neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
-    from
-      (values ('5m'),('15m'),('1h')) as t (x);
-
- metric_name: compute_current_lsn
-  type: gauge
-  help: 'Current LSN of the database'
-  key_labels:
-  values: [lsn]
-  query: |
-    select
-      case
-        when pg_catalog.pg_is_in_recovery()
-        then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
-        else (pg_current_wal_lsn() - '0/0')::FLOAT8
-      end as lsn;
-
- metric_name: compute_receive_lsn
-  type: gauge
-  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
-  key_labels:
-  values: [lsn]
-  query: |
-    SELECT
-      CASE
-        WHEN pg_catalog.pg_is_in_recovery()
-        THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
-        ELSE 0
-      END AS lsn;
-
- metric_name: replication_delay_bytes
-  type: gauge
-  help: 'Bytes between received and replayed LSN'
-  key_labels:
-  values: [replication_delay_bytes]
-  # We use a GREATEST call here because this calculation can be negative.
-  # The calculation is not atomic, meaning after we've gotten the receive
-  # LSN, the replay LSN may have advanced past the receive LSN we
-  # are using for the calculation.
-  query: |
-    SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
-
- metric_name: replication_delay_seconds
-  type: gauge
-  help: 'Time since last LSN was replayed'
-  key_labels:
-  values: [replication_delay_seconds]
-  query: |
-    SELECT
-      CASE
-        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
-        ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
-     END AS replication_delay_seconds;
-
- metric_name: checkpoints_req
-  type: gauge
-  help: 'Number of requested checkpoints'
-  key_labels:
-  values: [checkpoints_req]
-  query: |
-    SELECT checkpoints_req FROM pg_stat_bgwriter;
-
- metric_name: checkpoints_timed
-  type: gauge
-  help: 'Number of scheduled checkpoints'
-  key_labels:
-  values: [checkpoints_timed]
-  query: |
-    SELECT checkpoints_timed FROM pg_stat_bgwriter;
-
- metric_name: compute_logical_snapshot_files
-  type: gauge
-  help: 'Number of snapshot files in pg_logical/snapshot'
-  key_labels:
-    - timeline_id
-  values: [num_logical_snapshot_files]
-  query: |
-    SELECT
-      (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-      -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-      -- temporary snapshot files are renamed to the actual snapshot files after they are
-      -- completely built. We only WAL-log the completely built snapshot files.
-      (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
-
-# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
-# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
-
-# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
- metric_name: logical_slot_restart_lsn
-  type: gauge
-  help: 'restart_lsn of logical slots'
-  key_labels:
-    - slot_name
-  values: [restart_lsn]
-  query: |
-    select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
-    from pg_replication_slots
-    where slot_type = 'logical';
-
- metric_name: compute_subscriptions_count
-  type: gauge
-  help: 'Number of logical replication subscriptions grouped by enabled/disabled'
-  key_labels:
-    - enabled
-  values: [subscriptions_count]
-  query: |
-    select subenabled::text as enabled, count(*) as subscriptions_count
-    from pg_subscription
-    group by subenabled;
-
- metric_name: retained_wal
-  type: gauge
-  help: 'Retained WAL in inactive replication slots'
-  key_labels:
-    - slot_name
-  values: [retained_wal]
-  query: |
-    SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
-    FROM pg_replication_slots
-    WHERE active = false;
-
- metric_name: wal_is_lost
-  type: gauge
-  help: 'Whether or not the replication slot wal_status is lost'
-  key_labels:
-    - slot_name
-  values: [wal_is_lost]
-  query: |
-    SELECT slot_name,
-           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
-    FROM pg_replication_slots;
-
--- a/compute/etc/neon_collector_autoscaling.yml
+++ b/compute/etc/neon_collector_autoscaling.yml
@@ -1,55 +0,0 @@
-collector_name: neon_collector_autoscaling
-metrics:
- metric_name: lfc_misses
-  type: gauge
-  help: 'lfc_misses'
-  key_labels:
-  values: [lfc_misses]
-  query: |
-    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
- metric_name: lfc_used
-  type: gauge
-  help: 'LFC chunks used (chunk = 1MB)'
-  key_labels:
-  values: [lfc_used]
-  query: |
-    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
- metric_name: lfc_hits
-  type: gauge
-  help: 'lfc_hits'
-  key_labels:
-  values: [lfc_hits]
-  query: |
-    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
- metric_name: lfc_writes
-  type: gauge
-  help: 'lfc_writes'
-  key_labels:
-  values: [lfc_writes]
-  query: |
-    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
- metric_name: lfc_cache_size_limit
-  type: gauge
-  help: 'LFC cache size limit in bytes'
-  key_labels:
-  values: [lfc_cache_size_limit]
-  query: |
-    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
- metric_name: lfc_approximate_working_set_size_windows
-  type: gauge
-  help: 'Approximate working set size in pages of 8192 bytes'
-  key_labels: [duration_seconds]
-  values: [size]
-  # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
-  # size looking back 1..60 minutes, labeled with the number of minutes.
-  query: |
-    select
-      x::text as duration_seconds,
-      neon.approximate_working_set_size_seconds(x) as size
-    from
-      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
--- a/compute/etc/sql_exporter.yml
+++ b/compute/etc/sql_exporter.yml
@@ -1,33 +0,0 @@
-# Configuration for sql_exporter
-# Global defaults.
-global:
-  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-  scrape_timeout: 10s
-  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-  scrape_timeout_offset: 500ms
-  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-  min_interval: 0s
-  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-  # as will concurrent scrapes.
-  max_connections: 1
-  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-  # always be the same as max_connections.
-  max_idle_connections: 1
-  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-  # If 0, connections are not closed due to a connection's age.
-  max_connection_lifetime: 5m
-
-# The target to monitor and the collectors to execute on it.
-target:
-  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-  # the schema gets dropped or replaced to match the driver expected DSN format.
-  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
-
-  # Collectors (referenced by name) to execute on the target.
-  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-  collectors: [neon_collector]
-
-# Collector files specifies a list of globs. One collector definition is read from each matching file.
-# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-collector_files:
-  - "neon_collector.yml"
--- a/compute/etc/sql_exporter_autoscaling.yml
+++ b/compute/etc/sql_exporter_autoscaling.yml
@@ -1,33 +0,0 @@
-# Configuration for sql_exporter for autoscaling-agent
-# Global defaults.
-global:
-  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-  scrape_timeout: 10s
-  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-  scrape_timeout_offset: 500ms
-  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-  min_interval: 0s
-  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-  # as will concurrent scrapes.
-  max_connections: 1
-  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-  # always be the same as max_connections.
-  max_idle_connections: 1
-  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-  # If 0, connections are not closed due to a connection's age.
-  max_connection_lifetime: 5m
-
-# The target to monitor and the collectors to execute on it.
-target:
-  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-  # the schema gets dropped or replaced to match the driver expected DSN format.
-  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
-
-  # Collectors (referenced by name) to execute on the target.
-  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-  collectors: [neon_collector_autoscaling]
-
-# Collector files specifies a list of globs. One collector definition is read from each matching file.
-# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-collector_files:
-  - "neon_collector_autoscaling.yml"
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,6 +11,7 @@ testing = []

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
@@ -23,6 +24,7 @@ num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
+serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -41,6 +43,7 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
--- a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -1 +0,0 @@
-GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -793,9 +793,6 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!(
            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
        ),
-        include_str!(
-            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
-        ),
    ];

    MigrationRunner::new(client, &migrations).run_migrations()?;
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,10 +9,13 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
+postgres.workspace = true
+hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
@@ -20,6 +23,8 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+serde_with.workspace = true
+tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -151,7 +151,7 @@ where
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                tokio::time::sleep(RETRY_INTERVAL).await;
+                thread::sleep(RETRY_INTERVAL);
            }
            Err(e) => {
                println!("error starting process {process_name:?}: {e:#}");
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -34,14 +34,12 @@ use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
-use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
-use tokio::task::JoinSet;
 use url::Host;
 use utils::{
    auth::{Claims, Scope},
@@ -89,35 +87,34 @@ fn main() -> Result<()> {

    // Check for 'neon init' command first.
    let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
+        handle_init(sub_args).map(Some)
    } else {
        // all other commands need an existing config
-
-        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let mut env =
+            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
        let original_env = env.clone();
-        let env = Box::leak(Box::new(env));
+
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();

        let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
-            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
-            "mappings" => handle_mappings(sub_args, env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
+            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };

-        if &original_env != env {
-            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
+        if original_env != env {
+            subcommand_result.map(|()| Some(env))
        } else {
            subcommand_result.map(|()| None)
        }
@@ -1248,122 +1245,49 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(broker_command_data) => broker_command_data,
-        None => bail!("no broker subcommand provided"),
-    };
-
-    match sub_name {
-        "start" => {
-            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
-                eprintln!("broker start failed: {e}");
-                exit(1);
-            }
-        }
-
-        "stop" => {
-            if let Err(e) = broker::stop_broker_process(env) {
-                eprintln!("broker stop failed: {e}");
-                exit(1);
-            }
-        }
-
-        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
-    }
-    Ok(())
-}
-
 async fn handle_start_all(
-    env: &'static local_env::LocalEnv,
+    env: &local_env::LocalEnv,
    retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
-    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
-        neon_start_status_check(env, retry_timeout)
-            .await
-            .context("status check after successful startup of all services")?;
-        return Ok(());
-    };
-
-    eprintln!("startup failed because one or more services could not be started");
-
-    for e in errors {
-        eprintln!("{e}");
-        let debug_repr = format!("{e:?}");
-        for line in debug_repr.lines() {
-            eprintln!("  {line}");
-        }
-    }
-
-    try_stop_all(env, true).await;
-
-    exit(2);
-}
-
-/// Returns Ok() if and only if all services could be started successfully.
-/// Otherwise, returns the list of errors that occurred during startup.
-async fn handle_start_all_impl(
-    env: &'static local_env::LocalEnv,
-    retry_timeout: Duration,
-) -> Result<(), Vec<anyhow::Error>> {
    // Endpoints are not started automatically

-    let mut js = JoinSet::new();
+    broker::start_broker_process(env, retry_timeout).await?;

-    // force infalliblity through closure
-    #[allow(clippy::redundant_closure_call)]
-    (|| {
-        js.spawn(async move {
-            let retry_timeout = retry_timeout;
-            broker::start_broker_process(env, &retry_timeout).await
-        });
-
-        // Only start the storage controller if the pageserver is configured to need it
-        if env.control_plane_api.is_some() {
-            js.spawn(async move {
-                let storage_controller = StorageController::from_env(env);
-                storage_controller
-                    .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                        retry_timeout.into(),
-                    ))
-                    .await
-                    .map_err(|e| e.context("start storage_controller"))
-            });
-        }
-
-        for ps_conf in &env.pageservers {
-            js.spawn(async move {
-                let pageserver = PageServerNode::from_env(env, ps_conf);
-                pageserver
-                    .start(&retry_timeout)
-                    .await
-                    .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
-            });
-        }
-
-        for node in env.safekeepers.iter() {
-            js.spawn(async move {
-                let safekeeper = SafekeeperNode::from_env(env, node);
-                safekeeper
-                    .start(vec![], &retry_timeout)
-                    .await
-                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
-            });
-        }
-    })();
-
-    let mut errors = Vec::new();
-    while let Some(result) = js.join_next().await {
-        let result = result.expect("we don't panic or cancel the tasks");
-        if let Err(e) = result {
-            errors.push(e);
+    // Only start the storage controller if the pageserver is configured to need it
+    if env.control_plane_api.is_some() {
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller
+            .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                (*retry_timeout).into(),
+            ))
+            .await
+        {
+            eprintln!("storage_controller start failed: {:#}", e);
+            try_stop_all(env, true).await;
+            exit(1);
        }
    }

-    if !errors.is_empty() {
-        return Err(errors);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.start(retry_timeout).await {
+            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
+            try_stop_all(env, true).await;
+            exit(1);
+        }
    }

+    for node in env.safekeepers.iter() {
+        let safekeeper = SafekeeperNode::from_env(env, node);
+        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
+            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
+            try_stop_all(env, false).await;
+            exit(1);
+        }
+    }
+
+    neon_start_status_check(env, retry_timeout).await?;
+
    Ok(())
 }

@@ -1748,19 +1672,6 @@ fn cli() -> Command {
                            .arg(stop_mode_arg.clone())
                            .arg(instance_id))
        )
-        .subcommand(
-            Command::new("storage_broker")
-                .arg_required_else_help(true)
-                .about("Manage broker")
-                .subcommand(Command::new("start")
-                            .about("Start broker")
-                            .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("stop")
-                            .about("Stop broker")
-                            .arg(stop_mode_arg.clone())
-                )
-        )
        .subcommand(
            Command::new("safekeeper")
                .arg_required_else_help(true)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -702,7 +702,7 @@ impl Endpoint {
                    }
                }
            }
-            tokio::time::sleep(ATTEMPT_INTERVAL).await;
+            std::thread::sleep(ATTEMPT_INTERVAL);
        }

        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,7 +17,9 @@ use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -322,6 +324,22 @@ impl PageServerNode {
        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

+    pub async fn page_server_psql_client(
+        &self,
+    ) -> anyhow::Result<(
+        tokio_postgres::Client,
+        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+    )> {
+        let mut config = self.pg_connection_config.clone();
+        if self.conf.pg_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            config = config.set_password(Some(token));
+        }
+        Ok(config.connect_no_tls().await?)
+    }
+
    pub async fn check_status(&self) -> mgmt_api::Result<()> {
        self.http_client.status().await
    }
@@ -522,6 +540,19 @@ impl PageServerNode {
        Ok(())
    }

+    pub async fn location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        config: LocationConfig,
+        flush_ms: Option<Duration>,
+        lazy: bool,
+    ) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .location_config(tenant_shard_id, config, flush_ms, lazy)
+            .await?)
+    }
+
    pub async fn timeline_list(
        &self,
        tenant_shard_id: &TenantShardId,
@@ -605,4 +636,14 @@ impl PageServerNode {

        Ok(())
    }
+
+    pub async fn tenant_synthetic_size(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> anyhow::Result<TenantHistorySize> {
+        Ok(self
+            .http_client
+            .tenant_synthetic_size(tenant_shard_id)
+            .await?)
+    }
 }
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,10 +4,13 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Neon, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
+use anyhow::{bail, Context, Result};
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
 use std::fmt;
+use std::io::BufRead;
+use std::str::FromStr;

 /// In-memory representation of a postgresql.conf file
 #[derive(Default, Debug)]
@@ -16,16 +19,84 @@ pub struct PostgresConf {
    hash: HashMap<String, String>,
 }

+static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
+
 impl PostgresConf {
    pub fn new() -> PostgresConf {
        PostgresConf::default()
    }

+    /// Read file into memory
+    pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
+        let mut result = Self::new();
+
+        for line in std::io::BufReader::new(read).lines() {
+            let line = line?;
+
+            // Store each line in a vector, in original format
+            result.lines.push(line.clone());
+
+            // Also parse each line and insert key=value lines into a hash map.
+            //
+            // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
+            // But it's close enough for our usage.
+            let line = line.trim();
+            if line.starts_with('#') {
+                // comment, ignore
+                continue;
+            } else if let Some(caps) = CONF_LINE_RE.captures(line) {
+                let name = caps.get(1).unwrap().as_str();
+                let raw_val = caps.get(2).unwrap().as_str();
+
+                if let Ok(val) = deescape_str(raw_val) {
+                    // Note: if there's already an entry in the hash map for
+                    // this key, this will replace it. That's the behavior
+                    // we want; when PostgreSQL reads the file, each line
+                    // overrides any previous value for the same setting.
+                    result.hash.insert(name.to_string(), val.to_string());
+                }
+            }
+        }
+        Ok(result)
+    }
+
    /// Return the current value of 'option'
    pub fn get(&self, option: &str) -> Option<&str> {
        self.hash.get(option).map(|x| x.as_ref())
    }

+    /// Return the current value of a field, parsed to the right datatype.
+    ///
+    /// This parses the value of the field with `str::parse()` (via `FromStr`). If
+    /// the field does not exist, or parsing fails, returns an error.
+    ///
+    pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
+    where
+        T: FromStr,
+        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
+    {
+        self.get(field_name)
+            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
+            .parse::<T>()
+            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
+    }
+
+    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
+    where
+        T: FromStr,
+        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
+    {
+        if let Some(val) = self.get(field_name) {
+            let result = val
+                .parse::<T>()
+                .with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
+
+            Ok(Some(result))
+        } else {
+            Ok(None)
+        }
+    }
+
    ///
    /// Note: if you call this multiple times for the same option, the config
    /// file will a line for each call. It would be nice to have a function
@@ -83,8 +154,48 @@ fn escape_str(s: &str) -> String {
    }
 }

+/// De-escape a possibly-quoted value.
+///
+/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
+/// does this.
+fn deescape_str(s: &str) -> Result<String> {
+    // If the string has a quote at the beginning and end, strip them out.
+    if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
+        let mut result = String::new();
+
+        let mut iter = s[1..(s.len() - 1)].chars().peekable();
+        while let Some(c) = iter.next() {
+            let newc = if c == '\\' {
+                match iter.next() {
+                    Some('b') => '\x08',
+                    Some('f') => '\x0c',
+                    Some('n') => '\n',
+                    Some('r') => '\r',
+                    Some('t') => '\t',
+                    Some('0'..='7') => {
+                        // TODO: decode octal escapes (\NNN) as PostgreSQL's DeescapeQuotedString does
+                        bail!("octal escapes not supported");
+                    }
+                    Some(n) => n,
+                    None => break,
+                }
+            } else if c == '\'' && iter.peek() == Some(&'\'') {
+                // doubled quote becomes just one quote
+                iter.next().unwrap()
+            } else {
+                c
+            };
+
+            result.push(newc);
+        }
+        Ok(result)
+    } else {
+        Ok(s.to_string())
+    }
+}
+
 #[test]
-fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
+fn test_postgresql_conf_escapes() -> Result<()> {
    assert_eq!(escape_str("foo bar"), "'foo bar'");
    // these don't need to be quoted
    assert_eq!(escape_str("foo"), "foo");
@@ -103,5 +214,13 @@ fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
    assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
    assert_eq!(escape_str("10 cats"), "'10 cats'");

+    // Test de-escaping
+    assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
+    assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
+    assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
+
+    // octal-escapes are currently not supported
+    assert!(deescape_str("'foo\\7\\07\\007'").is_err());
+
    Ok(())
 }
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -11,11 +11,14 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
+hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
+thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
+serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true

--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,6 +5,9 @@ edition = "2021"
 license = "Apache-2.0"

 [dependencies]
+anyhow.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
+serde_with.workspace = true
+utils.workspace = true
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};

-#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;

 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, Deserialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -12,4 +12,5 @@ bytes.workspace = true
 utils.workspace = true
 parking_lot.workspace = true
 hex.workspace = true
+scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -173,6 +173,40 @@ impl Default for EvictionOrder {
    }
 }

+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetImpl {
+    Legacy,
+    Vectored,
+}
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -304,6 +338,8 @@ pub mod defaults {
    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+
    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
@@ -340,10 +376,7 @@ impl Default for ConfigToml {

            concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
                .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
-                DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
-            )
-            .unwrap(),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
            metric_collection_interval: (humantime::parse_duration(
                DEFAULT_METRIC_COLLECTION_INTERVAL,
            )
@@ -434,6 +467,8 @@ pub mod tenant_conf_defaults {
    // By default ingest enough WAL for two new L0 layers before checking if new image
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 impl Default for TenantConfigToml {
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
    #[serde(rename_all = "snake_case")]
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -5,8 +5,10 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
+futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -280,6 +280,16 @@ pub struct PostgresBackend<IO> {

 pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;

+pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
+    let mut query_string = query_string.to_vec();
+    if let Some(ch) = query_string.last() {
+        if *ch == 0 {
+            query_string.pop();
+        }
+    }
+    query_string
+}
+
 /// Cast a byte slice to a string slice, dropping null terminator if there's one.
 fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -5,10 +5,13 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+rand.workspace = true
 regex.workspace = true
 bytes.workspace = true
+byteorder.workspace = true
 anyhow.workspace = true
 crc32c.workspace = true
+hex.workspace = true
 once_cell.workspace = true
 log.workspace = true
 memoffset.workspace = true
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -9,8 +9,8 @@
 //! comments on them.
 //!

-use crate::PageHeaderData;
 use crate::BLCKSZ;
+use crate::{PageHeaderData, XLogRecord};

 //
 // From pg_tablespace_d.h
@@ -194,6 +194,8 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;

+pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
+
 //
 // from xlogrecord.h
 //
@@ -217,6 +219,8 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
 /* From transam.h */
 pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
 pub const INVALID_TRANSACTION_ID: u32 = 0;
+pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
+pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

 /* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -26,12 +26,11 @@ use bytes::{Buf, Bytes};
 use log::*;

 use serde::Serialize;
-use std::ffi::OsStr;
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
 use std::io::SeekFrom;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
@@ -79,34 +78,19 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
    )
 }

-pub fn XLogFromFileName(
-    fname: &OsStr,
-    wal_seg_size: usize,
-) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
-    if let Some(fname_str) = fname.to_str() {
-        let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
-        let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
-        let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
-        Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
-    } else {
-        anyhow::bail!("non-ut8 filename: {:?}", fname);
-    }
+pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
+    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
+    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
+    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
+    (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
 }

-pub fn IsXLogFileName(fname: &OsStr) -> bool {
-    if let Some(fname) = fname.to_str() {
-        fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
-    } else {
-        false
-    }
+pub fn IsXLogFileName(fname: &str) -> bool {
+    return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
 }

-pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
-    if let Some(fname) = fname.to_str() {
-        fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
-    } else {
-        false
-    }
+pub fn IsPartialXLogFileName(fname: &str) -> bool {
+    fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
 }

 /// If LSN points to the beginning of the page, then shift it to first record,
@@ -276,6 +260,13 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
    }
 }

+pub fn main() {
+    let mut data_dir = PathBuf::new();
+    data_dir.push(".");
+    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
+    println!("wal_end={:?}", wal_end);
+}
+
 impl XLogRecord {
    pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -9,6 +9,7 @@ anyhow.workspace = true
 clap.workspace = true
 env_logger.workspace = true
 log.workspace = true
+once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -7,7 +7,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{
    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
-use std::ffi::OsStr;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -27,6 +26,7 @@ macro_rules! xlog_utils_test {

 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }

+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
    pub pg_version: u32,
    pub pg_distrib_dir: PathBuf,
@@ -136,8 +136,8 @@ impl Conf {

    pub fn pg_waldump(
        &self,
-        first_segment_name: &OsStr,
-        last_segment_name: &OsStr,
+        first_segment_name: &str,
+        last_segment_name: &str,
    ) -> anyhow::Result<std::process::Output> {
        let first_segment_file = self.datadir.join(first_segment_name);
        let last_segment_file = self.datadir.join(last_segment_name);
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -4,7 +4,6 @@ use super::*;
 use crate::{error, info};
 use regex::Regex;
 use std::cmp::min;
-use std::ffi::OsStr;
 use std::fs::{self, File};
 use std::io::Write;
 use std::{env, str::FromStr};
@@ -55,7 +54,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
        .wal_dir()
        .read_dir()
        .unwrap()
-        .map(|f| f.unwrap().file_name())
+        .map(|f| f.unwrap().file_name().into_string().unwrap())
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
@@ -71,11 +70,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
            start_lsn
        );
        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-            let fname = file.file_name();
+            let fname = file.file_name().into_string().unwrap();
            if !IsXLogFileName(&fname) {
                continue;
            }
-            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
+            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
            if seg_start_lsn > u64::from(*start_lsn) {
                continue;
@@ -94,10 +93,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
 }

-fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
+fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
-        .pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
+        .pg_waldump("000000010000000000000001", last_segment)
        .unwrap()
        .stderr;
    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
@@ -118,7 +117,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {

 fn check_end_of_wal(
    cfg: &crate::Conf,
-    last_segment: &OsStr,
+    last_segment: &str,
    start_lsn: Lsn,
    expected_end_of_wal: Lsn,
 ) {
@@ -133,8 +132,7 @@ fn check_end_of_wal(
    // Rename file to partial to actually find last valid lsn, then rename it back.
    fs::rename(
        cfg.wal_dir().join(last_segment),
-        cfg.wal_dir()
-            .join(format!("{}.partial", last_segment.to_str().unwrap())),
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
    )
    .unwrap();
    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
@@ -144,8 +142,7 @@ fn check_end_of_wal(
    );
    assert_eq!(wal_end, expected_end_of_wal);
    fs::rename(
-        cfg.wal_dir()
-            .join(format!("{}.partial", last_segment.to_str().unwrap())),
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
        cfg.wal_dir().join(last_segment),
    )
    .unwrap();
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -8,8 +8,10 @@ license.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 itertools.workspace = true
+pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
+tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,11 +13,14 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
+aws-credential-types.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
+humantime.workspace = true
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
+rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -127,6 +127,10 @@ impl RemotePath {
        &self.0
    }

+    pub fn extension(&self) -> Option<&str> {
+        self.0.extension()
+    }
+
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,5 +6,6 @@ license.workspace = true

 [dependencies]
 serde.workspace = true
+serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -9,9 +9,8 @@ hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-
-[dev-dependencies]
-tracing-subscriber.workspace = true    # For examples in docs
+tracing-subscriber.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,6 +42,7 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
+serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
--- a/libs/utils/src/accum.rs
+++ b/libs/utils/src/accum.rs
@@ -0,0 +1,33 @@
+/// A helper to "accumulate" a value, similar to `Iterator::reduce`, but one that lets
+/// you feed in new values one at a time by calling [`Accum::accum`], instead of
+/// supplying an iterator.
+///
+/// For example, to calculate the smallest value among some integers:
+///
+/// ```
+/// use utils::accum::Accum;
+///
+/// let values = [1, 2, 3];
+///
+/// let mut min_value: Accum<u32> = Accum(None);
+/// for new_value in &values {
+///     min_value.accum(std::cmp::min, *new_value);
+/// }
+///
+/// assert_eq!(min_value.0.unwrap(), 1);
+/// ```
+pub struct Accum<T>(pub Option<T>);
+impl<T: Copy> Accum<T> {
+    pub fn accum<F>(&mut self, func: F, new_value: T)
+    where
+        F: FnOnce(T, T) -> T,
+    {
+        // If there is no previous value, just store the new value.
+        // Otherwise call the function to decide which one to keep.
+        self.0 = Some(if let Some(accum) = self.0 {
+            func(accum, new_value)
+        } else {
+            new_value
+        });
+    }
+}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -82,7 +82,7 @@ impl ApiError {
                StatusCode::INTERNAL_SERVER_ERROR,
            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
-                format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
+                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
            ),
        }
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -88,6 +88,12 @@ impl<'de> Deserialize<'de> for Id {
 }

 impl Id {
+    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
+        let mut arr = [0u8; 16];
+        buf.copy_to_slice(&mut arr);
+        Id::from(arr)
+    }
+
    pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
        if src.len() != 16 {
            return Err(IdError::SliceParseError(src.len()));
@@ -173,6 +179,10 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
    ($t:ident) => {
        impl $t {
+            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
+                $t(Id::get_from_buf(buf))
+            }
+
            pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
                Ok($t(Id::from_slice(src)?))
            }
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -21,13 +21,7 @@
 //!
 //! Another explaination can be found here: <https://brandur.org/rate-limiting>

-use std::{
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Mutex,
-    },
-    time::Duration,
-};
+use std::{sync::Mutex, time::Duration};

 use tokio::{sync::Notify, time::Instant};

@@ -134,7 +128,6 @@ impl LeakyBucketState {

 pub struct RateLimiter {
    pub config: LeakyBucketConfig,
-    pub sleep_counter: AtomicU64,
    pub state: Mutex<LeakyBucketState>,
    /// a queue to provide this fair ordering.
    pub queue: Notify,
@@ -151,7 +144,6 @@ impl Drop for Requeue<'_> {
 impl RateLimiter {
    pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
        RateLimiter {
-            sleep_counter: AtomicU64::new(0),
            state: Mutex::new(LeakyBucketState::with_initial_tokens(
                &config,
                initial_tokens,
@@ -171,16 +163,15 @@ impl RateLimiter {

    /// returns true if we did throttle
    pub async fn acquire(&self, count: usize) -> bool {
-        let start = tokio::time::Instant::now();
+        let mut throttled = false;

-        let start_count = self.sleep_counter.load(Ordering::Acquire);
-        let mut end_count = start_count;
+        let start = tokio::time::Instant::now();

        // wait until we are the first in the queue
        let mut notified = std::pin::pin!(self.queue.notified());
        if !notified.as_mut().enable() {
+            throttled = true;
            notified.await;
-            end_count = self.sleep_counter.load(Ordering::Acquire);
        }

        // notify the next waiter in the queue when we are done.
@@ -193,22 +184,9 @@ impl RateLimiter {
                .unwrap()
                .add_tokens(&self.config, start, count as f64);
            match res {
-                Ok(()) => return end_count > start_count,
+                Ok(()) => return throttled,
                Err(ready_at) => {
-                    struct Increment<'a>(&'a AtomicU64);
-
-                    impl Drop for Increment<'_> {
-                        fn drop(&mut self) {
-                            self.0.fetch_add(1, Ordering::AcqRel);
-                        }
-                    }
-
-                    // increment the counter after we finish sleeping (or cancel this task).
-                    // this ensures that tasks that have already started the acquire will observe
-                    // the new sleep count when they are allowed to resume on the notify.
-                    let _inc = Increment(&self.sleep_counter);
-                    end_count += 1;
-
+                    throttled = true;
                    tokio::time::sleep_until(ready_at).await;
                }
            }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -43,9 +43,16 @@ pub mod logging;
 pub mod lock_file;
 pub mod pid_file;

+// Misc
+pub mod accum;
+pub mod shutdown;
+
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;

+// Utility for putting a raw file descriptor into non-blocking mode
+pub mod nonblock;
+
 // Default signal handling
 pub mod sentry_init;
 pub mod signals;
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,5 +1,6 @@
 #![warn(missing_docs)]

+use camino::Utf8Path;
 use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
@@ -144,6 +145,14 @@ impl Lsn {
        i128::from(self.0) - i128::from(other)
    }

+    /// Parse an LSN from a filename in the form `0000000000000000`
+    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
+    where
+        F: AsRef<Utf8Path>,
+    {
+        Lsn::from_hex(filename.as_ref().as_str())
+    }
+
    /// Parse an LSN from a string in the form `0000000000000000`
    pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
    where
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -0,0 +1,17 @@
+use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
+use std::os::unix::io::RawFd;
+
+/// Put a file descriptor into non-blocking mode
+pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
+    let bits = fcntl(fd, F_GETFL)?;
+
+    // If F_GETFL returns some unknown bits, they should be valid
+    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
+    // would effectively clear them, which is not what we want.
+    let mut flags = OFlag::from_bits_retain(bits);
+    flags |= OFlag::O_NONBLOCK;
+
+    fcntl(fd, F_SETFL(flags))?;
+
+    Ok(())
+}
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -0,0 +1,7 @@
+/// Immediately terminate the calling process without calling
+/// atexit callbacks, C runtime destructors etc. We mainly use
+/// this to protect coverage data from concurrent writes.
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is always sound; the block is `unsafe` only because `_exit` is an FFI call.
+    unsafe { nix::libc::_exit(code as _) };
+}
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -120,6 +120,32 @@ impl<K: Ord, V> VecMap<K, V> {
        Ok((None, delta_size))
    }

+    /// Split the map into two.
+    ///
+    /// The left map contains every entry whose key is before `cutoff` (exclusive).
+    /// The right map contains `cutoff` itself and every entry after it (inclusive).
+    pub fn split_at(&self, cutoff: &K) -> (Self, Self)
+    where
+        K: Clone,
+        V: Clone,
+    {
+        let split_idx = self
+            .data
+            .binary_search_by_key(&cutoff, extract_key)
+            .unwrap_or_else(std::convert::identity);
+
+        (
+            VecMap {
+                data: self.data[..split_idx].to_vec(),
+                ordering: self.ordering,
+            },
+            VecMap {
+                data: self.data[split_idx..].to_vec(),
+                ordering: self.ordering,
+            },
+        )
+    }
+
    /// Move items from `other` to the end of `self`, leaving `other` empty.
    /// If the `other` ordering is different from `self` ordering
    /// `ExtendOrderingError` error will be returned.
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -15,11 +15,13 @@ anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
 futures.workspace = true
+inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -15,6 +15,7 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
+async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -22,9 +23,12 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
+crossbeam-utils.workspace = true
 either.workspace = true
+flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -53,6 +57,10 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
+signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+svg_fmt.workspace = true
+sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -65,6 +73,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,19 +9,41 @@ default = []

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 async-stream.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
+consumption_metrics.workspace = true
+crossbeam-utils.workspace = true
+either.workspace = true
+flate2.workspace = true
+fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
+hex.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
+thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
+tracing-error.workspace = true
 tracing-subscriber.workspace = true
+url.workspace = true
+walkdir.workspace = true
+metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true

--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -23,4 +24,5 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
 serde_json.workspace = true
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -13,6 +13,7 @@ use pageserver_api::{
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use std::env;
 use storage_broker::Uri;
+use utils::crashsafe::path_with_suffix_extension;
 use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
@@ -32,7 +33,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::virtual_file;
 use crate::virtual_file::io_engine;
-use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
+use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

 /// Global state of pageserver.
 ///
@@ -256,6 +257,17 @@ impl PageServerConf {
            .join(timeline_id.to_string())
    }

+    pub(crate) fn timeline_delete_mark_file_path(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Utf8PathBuf {
+        path_with_suffix_extension(
+            self.timeline_path(&tenant_shard_id, &timeline_id),
+            TIMELINE_DELETE_MARK_SUFFIX,
+        )
+    }
+
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -479,6 +491,11 @@ pub struct ConfigurableSemaphore {
 }

 impl ConfigurableSemaphore {
+    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
+        Some(x) => x,
+        None => panic!("const unwrap is not yet stable"),
+    };
+
    /// Initializse using a non-zero amount of permits.
    ///
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
@@ -499,6 +516,12 @@ impl ConfigurableSemaphore {
    }
 }

+impl Default for ConfigurableSemaphore {
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_INITIAL)
+    }
+}
+
 impl PartialEq for ConfigurableSemaphore {
    fn eq(&self, other: &Self) -> bool {
        // the number of permits can be increased at runtime, so we cannot really fulfill the
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                )
                .await;
                if let Err(e) = res {
-                    tracing::error!("failed to upload to remote storage: {e:#}");
+                    tracing::error!("failed to upload to S3: {e:#}");
                }
            }
        };
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
 }

 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_latency_histo: &'a Histogram,
+    global_metric: &'a Histogram,

    // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<&'a Histogram>,
+    timeline_metric: Option<&'a Histogram>,

    ctx: &'c RequestContext,
    start: std::time::Instant,
@@ -1212,10 +1212,9 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        self.global_latency_histo
-            .observe(ex_throttled.as_secs_f64());
-        if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
+        self.global_metric.observe(ex_throttled.as_secs_f64());
+        if let Some(timeline_metric) = self.timeline_metric {
+            timeline_metric.observe(ex_throttled.as_secs_f64());
        }
    }
 }
@@ -1241,32 +1240,10 @@ pub enum SmgrQueryType {

 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    global_started: [IntCounter; SmgrQueryType::COUNT],
-    global_latency: [Histogram; SmgrQueryType::COUNT],
-    per_timeline_getpage_started: IntCounter,
-    per_timeline_getpage_latency: Histogram,
+    global_metrics: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage: Histogram,
 }

-static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
-        "pageserver_smgr_query_started_global_count",
-        "Number of smgr queries started, aggregated by query type.",
-        &["smgr_query_type"],
-    )
-    .expect("failed to define a metric")
-});
-
-static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
-        "pageserver_smgr_query_started_count",
-        "Number of smgr queries started, aggregated by query type and tenant/timeline.",
-        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
-    )
-    .expect("failed to define a metric")
-});
-
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
@@ -1342,20 +1319,14 @@ impl SmgrQueryTimePerTimeline {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
-        let global_started = std::array::from_fn(|i| {
-            let op = SmgrQueryType::from_repr(i).unwrap();
-            SMGR_QUERY_STARTED_GLOBAL
-                .get_metric_with_label_values(&[op.into()])
-                .unwrap()
-        });
-        let global_latency = std::array::from_fn(|i| {
+        let global_metrics = std::array::from_fn(|i| {
            let op = SmgrQueryType::from_repr(i).unwrap();
            SMGR_QUERY_TIME_GLOBAL
                .get_metric_with_label_values(&[op.into()])
                .unwrap()
        });

-        let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
+        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
            .get_metric_with_label_values(&[
                SmgrQueryType::GetPageAtLsn.into(),
                &tenant_id,
@@ -1363,20 +1334,9 @@ impl SmgrQueryTimePerTimeline {
                &timeline_id,
            ])
            .unwrap();
-        let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-            .get_metric_with_label_values(&[
-                SmgrQueryType::GetPageAtLsn.into(),
-                &tenant_id,
-                &shard_slug,
-                &timeline_id,
-            ])
-            .unwrap();
-
        Self {
-            global_started,
-            global_latency,
-            per_timeline_getpage_latency,
-            per_timeline_getpage_started,
+            global_metrics,
+            per_timeline_getpage,
        }
    }
    pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1384,11 +1344,8 @@ impl SmgrQueryTimePerTimeline {
        op: SmgrQueryType,
        ctx: &'c RequestContext,
    ) -> Option<impl Drop + '_> {
+        let global_metric = &self.global_metrics[op as usize];
        let start = Instant::now();
-
-        self.global_started[op as usize].inc();
-
-        // We subtract time spent throttled from the observed latency.
        match ctx.micros_spent_throttled.open() {
            Ok(()) => (),
            Err(error) => {
@@ -1407,16 +1364,15 @@ impl SmgrQueryTimePerTimeline {
            }
        }

-        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
-            self.per_timeline_getpage_started.inc();
-            Some(&self.per_timeline_getpage_latency)
+        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            Some(&self.per_timeline_getpage)
        } else {
            None
        };

        Some(GlobalAndPerTimelineHistogramTimer {
-            global_latency_histo: &self.global_latency[op as usize],
-            per_timeline_latency_histo,
+            global_metric,
+            timeline_metric,
            ctx,
            start,
            op,
@@ -1467,12 +1423,9 @@ mod smgr_query_time_tests {
            let get_counts = || {
                let global: u64 = ops
                    .iter()
-                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
+                    .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
                    .sum();
-                (
-                    global,
-                    metrics.per_timeline_getpage_latency.get_sample_count(),
-                )
+                (global, metrics.per_timeline_getpage.get_sample_count())
            };

            let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1824,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
    .expect("failed to define a metric"),
    upload_heatmap_duration: register_histogram!(
        "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
+        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
    download_heatmap: register_int_counter!(
@@ -2623,12 +2576,6 @@ impl TimelineMetrics {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

-        let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
-            SmgrQueryType::GetPageAtLsn.into(),
-            tenant_id,
-            shard_id,
-            timeline_id,
-        ]);
        let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
            SmgrQueryType::GetPageAtLsn.into(),
            tenant_id,
@@ -2645,8 +2592,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }

-    tenant_throttling::remove_tenant_metrics(tenant_shard_id);
-
    // we leave the BROKEN_TENANTS_SET entry if any
 }

@@ -3110,180 +3055,41 @@ pub mod tokio_epoll_uring {
 pub(crate) mod tenant_throttling {
    use metrics::{register_int_counter_vec, IntCounter};
    use once_cell::sync::Lazy;
-    use utils::shard::TenantShardId;

    use crate::tenant::{self, throttle::Metric};

-    struct GlobalAndPerTenantIntCounter {
-        global: IntCounter,
-        per_tenant: IntCounter,
-    }
-
-    impl GlobalAndPerTenantIntCounter {
-        #[inline(always)]
-        pub(crate) fn inc(&self) {
-            self.inc_by(1)
-        }
-        #[inline(always)]
-        pub(crate) fn inc_by(&self, n: u64) {
-            self.global.inc_by(n);
-            self.per_tenant.inc_by(n);
-        }
-    }
-
    pub(crate) struct TimelineGet {
-        count_accounted_start: GlobalAndPerTenantIntCounter,
-        count_accounted_finish: GlobalAndPerTenantIntCounter,
-        wait_time: GlobalAndPerTenantIntCounter,
-        count_throttled: GlobalAndPerTenantIntCounter,
+        wait_time: IntCounter,
+        count: IntCounter,
    }

-    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count_accounted_start_global",
-            "Count of tenant throttling starts, by kind of throttle.",
-            &["kind"]
-        )
-        .unwrap()
-    });
-    static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count_accounted_start",
-            "Count of tenant throttling starts, by kind of throttle.",
-            &["kind", "tenant_id", "shard_id"]
-        )
-        .unwrap()
-    });
-    static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count_accounted_finish_global",
-            "Count of tenant throttling finishes, by kind of throttle.",
-            &["kind"]
-        )
-        .unwrap()
-    });
-    static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count_accounted_finish",
-            "Count of tenant throttling finishes, by kind of throttle.",
-            &["kind", "tenant_id", "shard_id"]
-        )
-        .unwrap()
-    });
-    static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
+    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
+        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
            "pageserver_tenant_throttling_wait_usecs_sum_global",
-            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
            &["kind"]
        )
-        .unwrap()
-    });
-    static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_wait_usecs_sum",
-            "Sum of microseconds that spent waiting throttle by kind of throttle.",
-            &["kind", "tenant_id", "shard_id"]
-        )
-        .unwrap()
+            .unwrap()
+        });
+
+        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
+                "pageserver_tenant_throttling_count_global",
+                "Count of tenant throttlings, by kind of throttle.",
+                &["kind"]
+            )
+            .unwrap()
+        });
+
+        let kind = "timeline_get";
+        TimelineGet {
+            wait_time: WAIT_USECS.with_label_values(&[kind]),
+            count: WAIT_COUNT.with_label_values(&[kind]),
+        }
    });

-    static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count_global",
-            "Count of tenant throttlings, by kind of throttle.",
-            &["kind"]
-        )
-        .unwrap()
-    });
-    static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_tenant_throttling_count",
-            "Count of tenant throttlings, by kind of throttle.",
-            &["kind", "tenant_id", "shard_id"]
-        )
-        .unwrap()
-    });
-
-    const KIND: &str = "timeline_get";
-
-    impl TimelineGet {
-        pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
-            TimelineGet {
-                count_accounted_start: {
-                    GlobalAndPerTenantIntCounter {
-                        global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
-                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
-                    }
-                },
-                count_accounted_finish: {
-                    GlobalAndPerTenantIntCounter {
-                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
-                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
-                    }
-                },
-                wait_time: {
-                    GlobalAndPerTenantIntCounter {
-                        global: WAIT_USECS.with_label_values(&[KIND]),
-                        per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
-                    }
-                },
-                count_throttled: {
-                    GlobalAndPerTenantIntCounter {
-                        global: WAIT_COUNT.with_label_values(&[KIND]),
-                        per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
-                    }
-                },
-            }
-        }
-    }
-
-    pub(crate) fn preinitialize_global_metrics() {
-        Lazy::force(&COUNT_ACCOUNTED_START);
-        Lazy::force(&COUNT_ACCOUNTED_FINISH);
-        Lazy::force(&WAIT_USECS);
-        Lazy::force(&WAIT_COUNT);
-    }
-
-    pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
-        for m in &[
-            &COUNT_ACCOUNTED_START_PER_TENANT,
-            &COUNT_ACCOUNTED_FINISH_PER_TENANT,
-            &WAIT_USECS_PER_TENANT,
-            &WAIT_COUNT_PER_TENANT,
-        ] {
-            let _ = m.remove_label_values(&[
-                KIND,
-                &tenant_shard_id.tenant_id.to_string(),
-                &tenant_shard_id.shard_slug().to_string(),
-            ]);
-        }
-    }
-
-    impl Metric for TimelineGet {
-        #[inline(always)]
-        fn accounting_start(&self) {
-            self.count_accounted_start.inc();
-        }
-        #[inline(always)]
-        fn accounting_finish(&self) {
-            self.count_accounted_finish.inc();
-        }
+    impl Metric for &'static TimelineGet {
        #[inline(always)]
        fn observe_throttling(
            &self,
@@ -3291,7 +3097,7 @@ pub(crate) mod tenant_throttling {
        ) {
            let val = u64::try_from(wait_time.as_micros()).unwrap();
            self.wait_time.inc_by(val);
-            self.count_throttled.inc();
+            self.count.inc();
        }
    }
 }
@@ -3421,14 +3227,11 @@ pub fn preinitialize_metrics() {
    }

    // countervecs
-    [
-        &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
-        &SMGR_QUERY_STARTED_GLOBAL,
-    ]
-    .into_iter()
-    .for_each(|c| {
-        Lazy::force(c);
-    });
+    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
+        .into_iter()
+        .for_each(|c| {
+            Lazy::force(c);
+        });

    // gauges
    WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3450,8 +3253,7 @@ pub fn preinitialize_metrics() {

    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
+    Lazy::force(&tenant_throttling::TIMELINE_GET);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
-
-    tenant_throttling::preinitialize_global_metrics();
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,6 +18,7 @@ use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
+use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
@@ -33,7 +34,6 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
-use std::future::Future;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -140,7 +140,6 @@ pub mod metadata;
 pub mod remote_timeline_client;
 pub mod storage_layer;

-pub mod checks;
 pub mod config;
 pub mod mgr;
 pub mod secondary;
@@ -302,7 +301,7 @@ pub struct Tenant {
    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) timeline_get_throttle:
-        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,

    /// An ongoing timeline detach concurrency limiter.
    ///
@@ -1031,9 +1030,13 @@ impl Tenant {
        }

        Ok(TenantPreload {
-            timelines: self
-                .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
-                .await?,
+            timelines: Self::load_timeline_metadata(
+                self,
+                remote_timeline_ids,
+                remote_storage,
+                cancel,
+            )
+            .await?,
        })
    }

@@ -1299,7 +1302,7 @@ impl Tenant {
        .await
    }

-    async fn load_timelines_metadata(
+    async fn load_timeline_metadata(
        self: &Arc<Tenant>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
@@ -1307,10 +1310,33 @@ impl Tenant {
    ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
        let mut part_downloads = JoinSet::new();
        for timeline_id in timeline_ids {
+            let client = RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.deletion_queue_client.clone(),
+                self.conf,
+                self.tenant_shard_id,
+                timeline_id,
+                self.generation,
+            );
            let cancel_clone = cancel.clone();
            part_downloads.spawn(
-                self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
-                    .instrument(info_span!("download_index_part", %timeline_id)),
+                async move {
+                    debug!("starting index part download");
+
+                    let index_part = client.download_index_file(&cancel_clone).await;
+
+                    debug!("finished index part download");
+
+                    Result::<_, anyhow::Error>::Ok(TimelinePreload {
+                        client,
+                        timeline_id,
+                        index_part,
+                    })
+                }
+                .map(move |res| {
+                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
+                })
+                .instrument(info_span!("download_index_part", %timeline_id)),
            );
        }

@@ -1321,7 +1347,8 @@ impl Tenant {
                next = part_downloads.join_next() => {
                    match next {
                        Some(result) => {
-                            let preload = result.context("join preload task")?;
+                            let preload_result = result.context("join preload task")?;
+                            let preload = preload_result?;
                            timeline_preloads.insert(preload.timeline_id, preload);
                        },
                        None => {
@@ -1338,36 +1365,6 @@ impl Tenant {
        Ok(timeline_preloads)
    }

-    fn load_timeline_metadata(
-        self: &Arc<Tenant>,
-        timeline_id: TimelineId,
-        remote_storage: GenericRemoteStorage,
-        cancel: CancellationToken,
-    ) -> impl Future<Output = TimelinePreload> {
-        let client = RemoteTimelineClient::new(
-            remote_storage.clone(),
-            self.deletion_queue_client.clone(),
-            self.conf,
-            self.tenant_shard_id,
-            timeline_id,
-            self.generation,
-        );
-        async move {
-            debug_assert_current_span_has_tenant_and_timeline_id();
-            debug!("starting index part download");
-
-            let index_part = client.download_index_file(&cancel).await;
-
-            debug!("finished index part download");
-
-            TimelinePreload {
-                client,
-                timeline_id,
-                index_part,
-            }
-        }
-    }
-
    pub(crate) async fn apply_timeline_archival_config(
        &self,
        timeline_id: TimelineId,
@@ -1576,9 +1573,6 @@ impl Tenant {
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
-        use checks::check_valid_layermap;
-        use itertools::Itertools;
-
        let tline = self
            .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
            .await?;
@@ -1593,18 +1587,6 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                .await?;
        }
-        let layer_names = tline
-            .layers
-            .read()
-            .await
-            .layer_map()
-            .unwrap()
-            .iter_historic_layers()
-            .map(|layer| layer.layer_name())
-            .collect_vec();
-        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!("invalid layermap: {err}");
-        }
        Ok(tline)
    }

@@ -2833,7 +2815,7 @@ impl Tenant {
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
-                crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
+                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -3215,9 +3197,6 @@ impl Tenant {
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
-        use checks::check_valid_layermap;
-        use itertools::Itertools;
-
        let tline = self
            .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
            .await?;
@@ -3238,18 +3217,6 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
                .await?;
        }
-        let layer_names = tline
-            .layers
-            .read()
-            .await
-            .layer_map()
-            .unwrap()
-            .iter_historic_layers()
-            .map(|layer| layer.layer_name())
-            .collect_vec();
-        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!("invalid layermap: {err}");
-        }
        Ok(tline)
    }

@@ -4197,18 +4164,9 @@ pub(crate) mod harness {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
            if records_neon {
                // For Neon wal records, we can decode without spawning postgres, so do so.
-                let mut page = match (base_img, records.first()) {
-                    (Some((_lsn, img)), _) => {
-                        let mut page = BytesMut::new();
-                        page.extend_from_slice(&img);
-                        page
-                    }
-                    (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
-                    _ => {
-                        panic!("Neon WAL redo requires base image or will init record");
-                    }
-                };
-
+                let base_img = base_img.expect("Neon WAL redo requires base image").1;
+                let mut page = BytesMut::new();
+                page.extend_from_slice(&base_img);
                for (record_lsn, record) in records {
                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                }
@@ -8534,7 +8492,6 @@ mod tests {
        let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
        let (tenant, ctx) = harness.load().await;

-        let will_init_keys = [2, 6];
        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
            key.field6 = id;
@@ -8584,25 +8541,18 @@ mod tests {
                }
            };

-            let will_init = will_init_keys.contains(&i);
-            if will_init {
-                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
-
-                expected_key_values.insert(key, "".to_string());
-            } else {
-                let delta = format!("@{lsn}");
-                delta_layer_spec.push((
-                    key,
-                    lsn,
-                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
-                ));
-
-                expected_key_values
-                    .get_mut(&key)
-                    .expect("An image exists for each key")
-                    .push_str(delta.as_str());
-            }
+            let delta = format!("@{lsn}");
+            delta_layer_spec.push((
+                key,
+                lsn,
+                Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+            ));
            delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
+
+            expected_key_values
+                .get_mut(&key)
+                .expect("An image exists for each key")
+                .push_str(delta.as_str());
        }

        delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
--- a/pageserver/src/tenant/checks.rs
+++ b/pageserver/src/tenant/checks.rs
@@ -1,55 +0,0 @@
-use std::collections::BTreeSet;
-
-use itertools::Itertools;
-
-use super::storage_layer::LayerName;
-
-/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
-/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
-///
-/// ```plain
-/// |       |                 |       |
-/// |   1   |    |   2   |    |   3   |
-/// |       |    |       |    |       |
-/// ```
-///
-/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
-/// the same LSN range.
-///
-/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
-///
-/// ```plain
-/// |       |    |   2   |    |       |
-/// |   1   |    |-------|    |   3   |
-/// |       |    |   4   |    |       |
-///
-/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
-pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
-    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
-    let mut all_delta_layers = Vec::new();
-    for name in metadata {
-        if let LayerName::Delta(layer) = name {
-            if layer.key_range.start.next() != layer.key_range.end {
-                all_delta_layers.push(layer.clone());
-            }
-        }
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = &layer.lsn_range;
-        lsn_split_point.insert(lsn_range.start);
-        lsn_split_point.insert(lsn_range.end);
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = layer.lsn_range.clone();
-        let intersects = lsn_split_point.range(lsn_range).collect_vec();
-        if intersects.len() > 1 {
-            let err = format!(
-                "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                layer,
-                intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-            );
-            return Some(err);
-        }
-    }
-    None
-}
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,29 +1,11 @@
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;

-use super::remote_timeline_client::index::GcBlockingReason;
-use tokio::time::Instant;
 use utils::id::TimelineId;

-type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+use super::remote_timeline_client::index::GcBlockingReason;

-#[derive(Default)]
-struct Storage {
-    timelines_blocked: TimelinesBlocked,
-    /// The deadline before which we are blocked from GC so that
-    /// leases have a chance to be renewed.
-    lsn_lease_deadline: Option<Instant>,
-}
+type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;

-impl Storage {
-    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
-        self.lsn_lease_deadline
-            .map(|d| Instant::now() < d)
-            .unwrap_or(false)
-    }
-}
-
-/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
-/// blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
    /// The timelines which have current reasons to block gc.
@@ -31,12 +13,6 @@ pub(crate) struct GcBlock {
    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
    reasons: std::sync::Mutex<Storage>,
-
-    /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
-    ///
-    /// Do not add any more features taking and forbidding taking this lock. It should be
-    /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
-    /// synchronizes with gc attempts by locking and unlocking this mutex.
    blocking: tokio::sync::Mutex<()>,
 }

@@ -66,20 +42,6 @@ impl GcBlock {
        }
    }

-    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
-    ///
-    /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
-    /// length, we guarantee that all the leases we granted before will have a chance to renew
-    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
-    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
-        let deadline = Instant::now() + lsn_lease_length;
-        let mut g = self.reasons.lock().unwrap();
-        g.lsn_lease_deadline = Some(deadline);
-    }
-
-    /// Describe the current gc blocking reasons.
-    ///
-    /// TODO: make this json serializable.
    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
        let g = self.reasons.lock().unwrap();

@@ -102,7 +64,7 @@ impl GcBlock {
    ) -> anyhow::Result<bool> {
        let (added, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
+            let set = g.entry(timeline.timeline_id).or_default();
            let added = set.insert(reason);

            // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +95,7 @@ impl GcBlock {

        let (remaining_blocks, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            match g.timelines_blocked.entry(timeline.timeline_id) {
+            match g.entry(timeline.timeline_id) {
                Entry::Occupied(mut oe) => {
                    let set = oe.get_mut();
                    set.remove(reason);
@@ -147,7 +109,7 @@ impl GcBlock {
                }
            }

-            let remaining_blocks = g.timelines_blocked.len();
+            let remaining_blocks = g.len();

            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
            let uploaded = timeline
@@ -172,11 +134,11 @@ impl GcBlock {
    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
        let unblocked = {
            let mut g = self.reasons.lock().unwrap();
-            if g.timelines_blocked.is_empty() {
+            if g.is_empty() {
                return;
            }

-            g.timelines_blocked.remove(&timeline.timeline_id);
+            g.remove(&timeline.timeline_id);

            BlockingReasons::clean_and_summarize(g).is_none()
        };
@@ -187,11 +149,10 @@ impl GcBlock {
    }

    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
+    pub(crate) fn set_scanned(&self, scanned: Storage) {
        let mut g = self.reasons.lock().unwrap();
-        assert!(g.timelines_blocked.is_empty());
-        g.timelines_blocked
-            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.is_empty());
+        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));

        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
            tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +166,6 @@ pub(super) struct Guard<'a> {

 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
-    tenant_blocked_by_lsn_lease_deadline: bool,
    timelines: usize,
    reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -214,8 +174,8 @@ impl std::fmt::Display for BlockingReasons {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
-            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
+            "{} timelines block for {:?}",
+            self.timelines, self.reasons
        )
    }
 }
@@ -223,15 +183,13 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
        let mut reasons = enumset::EnumSet::empty();
-        g.timelines_blocked.retain(|_key, value| {
+        g.retain(|_key, value| {
            reasons = reasons.union(*value);
            !value.is_empty()
        });
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
+        if !g.is_empty() {
            Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                reasons,
            })
        } else {
@@ -240,17 +198,14 @@ impl BlockingReasons {
    }

    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
-        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
+        if g.is_empty() {
            None
        } else {
            let reasons = g
-                .timelines_blocked
                .values()
                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
            Some(BlockingReasons {
-                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
-                timelines: g.timelines_blocked.len(),
+                timelines: g.len(),
                reasons,
            })
        }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -949,12 +949,6 @@ impl TenantManager {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                    match attach_conf.generation.cmp(&tenant.generation) {
                        Ordering::Equal => {
-                            if attach_conf.attach_mode == AttachmentMode::Single {
-                                tenant
-                                    .gc_block
-                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
-                            }
-
                            // A transition from Attached to Attached in the same generation, we may
                            // take our fast path and just provide the updated configuration
                            // to the tenant.
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -276,20 +276,10 @@ pub(crate) enum LayerId {
    InMemoryLayerId(InMemoryLayerFileId),
 }

-/// Uniquely identify a layer visit by the layer
-/// and LSN floor (or start LSN) of the reads.
-/// The layer itself is not enough since we may
-/// have different LSN lower bounds for delta layer reads.
-#[derive(Debug, PartialEq, Eq, Clone, Hash)]
-struct LayerToVisitId {
-    layer_id: LayerId,
-    lsn_floor: Lsn,
-}
-
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) enum ReadableLayer {
    PersistentLayer(Layer),
    InMemoryLayer(Arc<InMemoryLayer>),
@@ -297,11 +287,13 @@ pub(crate) enum ReadableLayer {

 /// A partial description of a read to be done.
 #[derive(Debug, Clone)]
-struct LayerVisit {
+struct ReadDesc {
    /// An id used to resolve the readable layer within the fringe
-    layer_to_visit_id: LayerToVisitId,
+    layer_id: LayerId,
    /// Lsn range for the read, used for selecting the next read
    lsn_range: Range<Lsn>,
+    /// This read's index in [`LayerKeyspace::reads`].
+    read_id: LayerKeyspaceReadId,
 }

 /// Data structure which maintains a fringe of layers for the
@@ -313,46 +305,52 @@ struct LayerVisit {
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
-    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
+    layers: HashMap<LayerId, LayerKeyspace>,
 }

 #[derive(Debug)]
-struct LayerVisitReads {
+struct LayerKeyspace {
    layer: ReadableLayer,
-    target_keyspace: KeySpaceRandomAccum,
+    next_read_id: LayerKeyspaceReadId,
+    reads: HashMap<LayerKeyspaceReadId, (Range<Lsn>, KeySpace)>,
 }

+#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
+struct LayerKeyspaceReadId(usize);
+
 impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
-            planned_visits_by_lsn: BinaryHeap::new(),
-            visit_reads: HashMap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
+            layers: HashMap::new(),
        }
    }

    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_visits_by_lsn.pop() {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
            Some(desc) => desc,
            None => return None,
        };

-        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
+        let mut entry = match self.layers.entry(read_desc.layer_id) {
+            Entry::Occupied(o) => o,
+            Entry::Vacant(_) => unreachable!("fringe internals are always consistent"),
+        };

-        match removed {
-            Some((
-                _,
-                LayerVisitReads {
-                    layer,
-                    mut target_keyspace,
-                },
-            )) => Some((
-                layer,
-                target_keyspace.consume_keyspace(),
-                read_desc.lsn_range,
-            )),
-            None => unreachable!("fringe internals are always consistent"),
+        let (lsn_range, keyspace) = entry
+            .get_mut()
+            .reads
+            .remove(&read_desc.read_id)
+            .expect("fringe internals are always consistent");
+
+        let layer = entry.get().layer.clone();
+
+        if entry.get().reads.is_empty() {
+            entry.remove();
        }
+
+        Some((layer, keyspace, lsn_range))
    }

    pub(crate) fn update(
@@ -361,26 +359,35 @@ impl LayerFringe {
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
    ) {
-        let layer_to_visit_id = LayerToVisitId {
-            layer_id: layer.id(),
-            lsn_floor: lsn_range.start,
-        };
-
-        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
+        let layer_id = layer.id();
+        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.add_keyspace(keyspace);
+                let read_id = {
+                    let r = &mut entry.get_mut().next_read_id;
+                    let read_id = *r;
+                    *r = LayerKeyspaceReadId(r.0 + 1);
+                    read_id
+                };
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range: lsn_range.clone(),
+                    layer_id: layer_id.clone(),
+                    read_id,
+                });
+                let replaced = entry.get_mut().reads.insert(read_id, (lsn_range, keyspace));
+                assert!(replaced.is_none());
            }
            Entry::Vacant(entry) => {
-                self.planned_visits_by_lsn.push(LayerVisit {
-                    lsn_range,
-                    layer_to_visit_id: layer_to_visit_id.clone(),
+                let read_id = LayerKeyspaceReadId(0);
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range: lsn_range.clone(),
+                    layer_id: layer_id.clone(),
+                    read_id,
                });
-                let mut accum = KeySpaceRandomAccum::new();
-                accum.add_keyspace(keyspace);
-                entry.insert(LayerVisitReads {
+                entry.insert(LayerKeyspace {
                    layer,
-                    target_keyspace: accum,
+                    next_read_id: LayerKeyspaceReadId(1),
+                    reads: [(read_id, (lsn_range, keyspace))].into(),
                });
            }
        }
@@ -393,7 +400,7 @@ impl Default for LayerFringe {
    }
 }

-impl Ord for LayerVisit {
+impl Ord for ReadDesc {
    fn cmp(&self, other: &Self) -> Ordering {
        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
        if ord == std::cmp::Ordering::Equal {
@@ -404,19 +411,19 @@ impl Ord for LayerVisit {
    }
 }

-impl PartialOrd for LayerVisit {
+impl PartialOrd for ReadDesc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }

-impl PartialEq for LayerVisit {
+impl PartialEq for ReadDesc {
    fn eq(&self, other: &Self) -> bool {
        self.lsn_range == other.lsn_range
    }
 }

-impl Eq for LayerVisit {}
+impl Eq for ReadDesc {}

 impl ReadableLayer {
    pub(crate) fn id(&self) -> LayerId {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -58,6 +58,7 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -69,7 +70,9 @@ use utils::{
 };

 use super::layer_name::ImageLayerName;
-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -797,9 +800,10 @@ impl ImageLayerWriterInner {
    ///
    async fn finish(
        self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Option<Key>,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -875,9 +879,12 @@ impl ImageLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        trace!("created image layer {}", self.path);
+        // FIXME: why not carry the virtualfile here, it supports renaming?
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        Ok((desc, self.path))
+        info!("created image layer {}", layer.local_path());
+
+        Ok(layer)
    }
 }

@@ -956,18 +963,24 @@ impl ImageLayerWriter {
    ///
    pub(crate) async fn finish(
        mut self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(ctx, None).await
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline, ctx, None).await
    }

    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
+        timeline: &Arc<Timeline>,
        end_key: Key,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner
+            .take()
+            .unwrap()
+            .finish(timeline, ctx, Some(end_key))
+            .await
    }
 }

@@ -1071,7 +1084,7 @@ mod test {
        tenant::{
            config::TenantConf,
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::{Layer, ResidentLayer},
+            storage_layer::ResidentLayer,
            vectored_blob_io::StreamingVectoredReadPlanner,
            Tenant, Timeline,
        },
@@ -1142,8 +1155,7 @@ mod test {

                key = key.next();
            }
-            let (desc, path) = writer.finish(&ctx).await.unwrap();
-            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
+            writer.finish(&timeline, &ctx).await.unwrap()
        };
        let original_size = resident.metadata().file_size;

@@ -1205,9 +1217,7 @@ mod test {
                .await
                .unwrap();
            let replacement = if wrote_keys > 0 {
-                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
-                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
-                Some(resident)
+                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
            } else {
                None
            };
@@ -1280,8 +1290,7 @@ mod test {
        for (key, img) in images {
            writer.put_image(key, img, ctx).await?;
        }
-        let (desc, path) = writer.finish(ctx).await?;
-        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
+        let img_layer = writer.finish(tline, ctx).await?;

        Ok::<_, anyhow::Error>(img_layer)
    }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -439,30 +439,11 @@ impl Layer {

    fn record_access(&self, ctx: &RequestContext) {
        if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible: maybe log about this
-            match ctx.task_kind() {
-                TaskKind::CalculateSyntheticSize
-                | TaskKind::GarbageCollector
-                | TaskKind::MgmtRequest => {
-                    // This situation is expected in code paths do binary searches of the LSN space to resolve
-                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
-                    // and on-demand for certain HTTP API requests.
-                }
-                _ => {
-                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
-                    // some branch tip, so we log the fact that we are accessing something that the visibility
-                    // calculation thought should not be visible.
-                    //
-                    // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
-                    // which was covered by a concurrent compaction.
-                    tracing::info!(
-                        "Layer {} became visible as a result of access",
-                        self.0.desc.key()
-                    );
-                }
-            }
-
-            // Update the timeline's visible bytes count
+            // Visibility was modified to Visible
+            tracing::info!(
+                "Layer {} became visible as a result of access",
+                self.0.desc.key()
+            );
            if let Some(tl) = self.0.timeline.upgrade() {
                tl.metrics
                    .visible_physical_size_gauge
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1025,15 +1025,6 @@ fn access_stats() {
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
    access_stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
-
-    // Recording access implicitly makes layer visible, if it wasn't already
-    let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
-    access_stats.set_visibility(LayerVisibilityHint::Covered);
-    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
-    assert!(access_stats.record_access_at(atime));
-    access_stats.set_visibility(LayerVisibilityHint::Visible);
-    assert!(!access_stats.record_access_at(atime));
-    access_stats.set_visibility(LayerVisibilityHint::Visible);
 }

 #[test]
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
                self.generated_layers
                    .push(SplitWriterResult::Discarded(layer_key));
            } else {
-                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
-
-                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-                self.generated_layers
-                    .push(SplitWriterResult::Produced(layer));
+                self.generated_layers.push(SplitWriterResult::Produced(
+                    prev_image_writer
+                        .finish_with_end_key(tline, key, ctx)
+                        .await?,
+                ));
            }
        }
        self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
        if discard(&layer_key).await {
            generated_layers.push(SplitWriterResult::Discarded(layer_key));
        } else {
-            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
-            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            generated_layers.push(SplitWriterResult::Produced(layer));
+            generated_layers.push(SplitWriterResult::Produced(
+                inner.finish_with_end_key(tline, end_key, ctx).await?,
+            ));
        }
        Ok(generated_layers)
    }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -163,6 +163,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    // How many errors we have seen consequtively
    let mut error_run_count = 0;

+    let mut last_throttle_flag_reset_at = Instant::now();
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -189,6 +191,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            }

+
+
            let sleep_duration;
            if period == Duration::ZERO {
                #[cfg(not(feature = "testing"))]
@@ -203,18 +207,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                };

                // Run compaction
-                let IterationResult { output, elapsed } = iteration
-                    .run(tenant.compaction_iteration(&cancel, &ctx))
-                    .await;
+                let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
                match output {
                    Ok(has_pending_task) => {
                        error_run_count = 0;
                        // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task {
-                            Duration::ZERO
-                        } else {
-                            period
-                        };
+                        sleep_duration = if has_pending_task { Duration::ZERO } else { period };
                    }
                    Err(e) => {
                        let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -235,20 +233,38 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }

                // the duration is recorded by performance tests by enabling debug in this function
-                tracing::debug!(
-                    elapsed_ms = elapsed.as_millis(),
-                    "compaction iteration complete"
-                );
+                tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
            };

+
            // Perhaps we did no work and the walredo process has been idle for some time:
            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
-            // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
-            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
            if let Some(walredo_mgr) = &tenant.walredo_mgr {
                walredo_mgr.maybe_quiesce(period * 10);
            }

+            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                info!(
+                    n_seconds=%format_args!("{:.3}",
+                    delta.as_secs_f64()),
+                    count_accounted,
+                    count_throttled,
+                    sum_throttled_usecs,
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds"
+                );
+            });
+
            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
@@ -330,7 +346,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);

        let mut first = true;
-        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
@@ -348,6 +363,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                first = false;

                let delays = async {
+                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
                    random_init_delay(period, &cancel).await?;
                    Ok::<_, Cancelled>(())
                };
@@ -421,7 +437,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
-    let mut last_throttle_flag_reset_at = Instant::now();
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
@@ -468,29 +483,6 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                kind: BackgroundLoopKind::IngestHouseKeeping,
            };
            iteration.run(tenant.ingest_housekeeping()).await;
-
-            // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
-            // Or just spawn another background loop for this throttle, it's not like it's super costly.
-            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
-                let now = Instant::now();
-                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
-                if count_throttled == 0 {
-                    return;
-                }
-                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
-                let delta = now - prev;
-                info!(
-                    n_seconds=%format_args!("{:.3}",
-                    delta.as_secs_f64()),
-                    count_accounted = count_accounted_finish,  // don't break existing log scraping
-                    count_throttled,
-                    sum_throttled_usecs,
-                    count_accounted_start, // log after pre-existing fields to not break existing log scraping
-                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
-            });
        }
    }
    .await;
@@ -546,12 +538,28 @@ pub(crate) async fn random_init_delay(
        let mut rng = rand::thread_rng();
        rng.gen_range(Duration::ZERO..=period)
    };
+
    match tokio::time::timeout(d, cancel.cancelled()).await {
        Ok(_) => Err(Cancelled),
        Err(_) => Ok(()),
    }
 }

+/// Delays GC by the default lease length at restart.
+///
+/// We do this because the lease mapping is not persisted to disk. By delaying GC by the
+/// default lease length, we guarantee that all the leases we granted before the restart will
+/// have expired when we run GC for the first time after the restart.
+pub(crate) async fn delay_by_lease_length(
+    length: Duration,
+    cancel: &CancellationToken,
+) -> Result<(), Cancelled> {
+    match tokio::time::timeout(length, cancel.cancelled()).await {
+        Ok(_) => Err(Cancelled),
+        Err(_) => Ok(()),
+    }
+}
+
 struct Iteration {
    started_at: Instant,
    period: Duration,
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -24,10 +24,8 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
 pub struct Throttle<M: Metric> {
    inner: ArcSwap<Inner>,
    metric: M,
-    /// will be turned into [`Stats::count_accounted_start`]
-    count_accounted_start: AtomicU64,
-    /// will be turned into [`Stats::count_accounted_finish`]
-    count_accounted_finish: AtomicU64,
+    /// will be turned into [`Stats::count_accounted`]
+    count_accounted: AtomicU64,
    /// will be turned into [`Stats::count_throttled`]
    count_throttled: AtomicU64,
    /// will be turned into [`Stats::sum_throttled_usecs`]
@@ -45,21 +43,17 @@ pub struct Observation {
    pub wait_time: Duration,
 }
 pub trait Metric {
-    fn accounting_start(&self);
-    fn accounting_finish(&self);
    fn observe_throttling(&self, observation: &Observation);
 }

 /// See [`Throttle::reset_stats`].
 pub struct Stats {
-    /// Number of requests that started [`Throttle::throttle`] calls.
-    pub count_accounted_start: u64,
-    /// Number of requests that finished [`Throttle::throttle`] calls.
-    pub count_accounted_finish: u64,
-    /// Subset of the `accounted` requests that were actually throttled.
-    /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
+    pub count_accounted: u64,
+    // Subset of the `accounted` requests that were actually throttled.
+    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
    pub count_throttled: u64,
-    /// Sum of microseconds that throttled requests spent waiting for throttling.
+    // Sum of microseconds that throttled requests spent waiting for throttling.
    pub sum_throttled_usecs: u64,
 }

@@ -71,8 +65,7 @@ where
        Self {
            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
            metric,
-            count_accounted_start: AtomicU64::new(0),
-            count_accounted_finish: AtomicU64::new(0),
+            count_accounted: AtomicU64::new(0),
            count_throttled: AtomicU64::new(0),
            sum_throttled_usecs: AtomicU64::new(0),
        }
@@ -124,13 +117,11 @@ where
    /// This method allows retrieving & resetting that flag.
    /// Useful for periodic reporting.
    pub fn reset_stats(&self) -> Stats {
-        let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
-        let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
+        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
        let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
        let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
        Stats {
-            count_accounted_start,
-            count_accounted_finish,
+            count_accounted,
            count_throttled,
            sum_throttled_usecs,
        }
@@ -148,12 +139,9 @@ where
        };
        let start = std::time::Instant::now();

-        self.metric.accounting_start();
-        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
-        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
-        self.metric.accounting_finish();

+        self.count_accounted.fetch_add(1, Ordering::Relaxed);
        if did_throttle {
            self.count_throttled.fetch_add(1, Ordering::Relaxed);
            let now = Instant::now();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -196,8 +196,9 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: RemoteTimelineClient,
-    pub timeline_get_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+    pub timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

@@ -405,8 +406,9 @@ pub struct Timeline {
    gc_lock: tokio::sync::Mutex<()>,

    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
-    timeline_get_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+    timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,

    /// Keep aux directory cache to avoid it's reconstruction on each update
    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
@@ -4011,9 +4013,7 @@ impl Timeline {
        if wrote_keys {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
-            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-            info!("created image layer for rel {}", image_layer.local_path());
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -4101,12 +4101,7 @@ impl Timeline {
        if wrote_any_image {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
-            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-            info!(
-                "created image layer for metadata {}",
-                image_layer.local_path()
-            );
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -4314,9 +4309,7 @@ impl Timeline {
        timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
-        if !image_layers.is_empty() {
-            self.update_layer_visibility().await?;
-        }
+        self.update_layer_visibility().await?;

        Ok(image_layers)
    }
@@ -5378,8 +5371,7 @@ impl Timeline {
    /// Force create an image layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
-    /// placed into the layer map in one run AND be validated.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
    #[cfg(test)]
    pub(super) async fn force_create_image_layer(
        self: &Arc<Timeline>,
@@ -5411,9 +5403,8 @@ impl Timeline {
        for (key, img) in images {
            image_layer_writer.put_image(key, img, ctx).await?;
        }
-        let (desc, path) = image_layer_writer.finish(ctx).await?;
-        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-        info!("force created image layer {}", image_layer.local_path());
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
+
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5425,8 +5416,7 @@ impl Timeline {
    /// Force create a delta layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
-    /// placed into the layer map in one run AND be validated.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
        self: &Arc<Timeline>,
@@ -5452,6 +5442,33 @@ impl Timeline {
        if let Some(check_start_lsn) = check_start_lsn {
            assert!(deltas.lsn_range.start >= check_start_lsn);
        }
+        // Check that the delta layer does not violate the LSN invariant: the legacy compaction should always
+        // produce a batch of layers with the same start/end LSN, and so should the force-inserted layer.
+        {
+            /// Checks whether `a` overlaps with `b`, assuming both are half-open ranges `[start, end)`.
+            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+
+            if deltas.key_range.start.next() != deltas.key_range.end {
+                let guard = self.layers.read().await;
+                let mut invalid_layers =
+                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
+                        layer.is_delta()
+                        && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
+                        && layer.lsn_range != deltas.lsn_range
+                        // skip single-key layer files
+                        && layer.key_range.start.next() != layer.key_range.end
+                    });
+                if let Some(layer) = invalid_layers.next() {
+                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
+                    panic!(
+                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
+                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
+                    );
+                }
+            }
+        }
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
@@ -5466,7 +5483,7 @@ impl Timeline {
        }
        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-        info!("force created delta layer {}", delta_layer.local_path());
+
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(delta_layer);
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,7 +29,6 @@ use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
@@ -564,12 +563,10 @@ impl Timeline {
                .await?;

            if keys_written > 0 {
-                let (desc, path) = image_layer_writer
-                    .finish(ctx)
+                let new_layer = image_layer_writer
+                    .finish(self, ctx)
                    .await
                    .map_err(CompactionError::Other)?;
-                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
-                    .map_err(CompactionError::Other)?;
                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);
@@ -1789,12 +1786,20 @@ impl Timeline {
                stat.visit_image_layer(desc.file_size());
            }
        }
-        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
-            .iter()
-            .map(|layer| layer.layer_desc().layer_name())
-            .collect_vec();
-        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!("cannot run gc-compaction because {}", err);
+        for layer in &layer_selection {
+            let desc = layer.layer_desc();
+            let key_range = &desc.key_range;
+            if desc.is_delta() && key_range.start.next() != key_range.end {
+                let lsn_range = desc.lsn_range.clone();
+                let intersects = lsn_split_point.range(lsn_range).collect_vec();
+                if intersects.len() > 1 {
+                    bail!(
+                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                        desc.key(),
+                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+                    );
+                }
+            }
        }
        // The maximum LSN we are processing in this compaction loop
        let end_lsn = layer_selection
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -135,6 +135,25 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
        .context("delete_all")
 }

+// This function removes the remaining traces of a timeline on disk.
+// Namely: metadata file, timeline directory, delete mark.
+// Note: io::ErrorKind::NotFound errors are ignored for metadata and timeline dir.
+// The delete mark should be present because removing it is the last step of deletion
+// (nothing can fail after its deletion).
+async fn cleanup_remaining_timeline_fs_traces(
+    conf: &PageServerConf,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> anyhow::Result<()> {
+    // Remove delete mark
+    // TODO: once we are confident that no more exist in the field, remove this
+    // line.  It cleans up a legacy marker file that might in rare cases be present.
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove delete mark")
+}
+
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
@@ -175,10 +194,12 @@ async fn remove_timeline_from_tenant(
 /// 7. Delete mark file
 ///
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are two entrypoints to the process:
+/// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
+/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
+///    index but still have local metadata, timeline directory and delete mark.
 ///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
@@ -290,6 +311,18 @@ impl DeleteTimelineFlow {
        Ok(())
    }

+    #[instrument(skip_all, fields(%timeline_id))]
+    pub async fn cleanup_remaining_timeline_fs_traces(
+        tenant: &Tenant,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
+                .await;
+        info!("Done");
+        r
+    }
+
    fn prepare(
        tenant: &Tenant,
        timeline_id: TimelineId,
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,7 +35,6 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
-use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -297,97 +296,6 @@ impl PostgresRedoManager {
        }
    }

-    async fn do_with_walredo_process<
-        F: FnOnce(Arc<Process>) -> Fut,
-        Fut: Future<Output = Result<O, Error>>,
-        O,
-    >(
-        &self,
-        pg_version: u32,
-        closure: F,
-    ) -> Result<O, Error> {
-        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-            Ok(guard) => match &*guard {
-                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                ProcessOnceCell::ManagerShutDown => {
-                    return Err(Error::Cancelled);
-                }
-            },
-            Err(permit) => {
-                let start = Instant::now();
-                // acquire guard before spawning process, so that we don't spawn new processes
-                // if the gate is already closed.
-                let _launched_processes_guard = match self.launched_processes.enter() {
-                    Ok(guard) => guard,
-                    Err(GateError::GateClosed) => unreachable!(
-                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                    ),
-                };
-                let proc = Arc::new(Process {
-                    process: process::WalRedoProcess::launch(
-                        self.conf,
-                        self.tenant_shard_id,
-                        pg_version,
-                    )
-                    .context("launch walredo process")?,
-                    _launched_processes_guard,
-                });
-                let duration = start.elapsed();
-                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                info!(
-                    elapsed_ms = duration.as_millis(),
-                    pid = proc.id(),
-                    "launched walredo process"
-                );
-                self.redo_process
-                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                proc
-            }
-        };
-
-        // async closures are unstable, would support &Process
-        let result = closure(proc.clone()).await;
-
-        if result.is_err() {
-            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-            // Note that there may be other tasks concurrent with us that also hold `proc`.
-            // We have to deal with that here.
-            // Also read the doc comment on field `self.redo_process`.
-            //
-            // NB: there may still be other concurrent threads using `proc`.
-            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-            //
-            // NB: the drop impl blocks the dropping thread with a wait() system call for
-            // the child process. In some ways the blocking is actually good: if we
-            // deferred the waiting into the background / to tokio if we used `tokio::process`,
-            // it could happen that if walredo always fails immediately, we spawn processes faster
-            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-            // This probably needs revisiting at some later point.
-            match self.redo_process.get() {
-                None => (),
-                Some(guard) => {
-                    match &*guard {
-                        ProcessOnceCell::ManagerShutDown => {}
-                        ProcessOnceCell::Spawned(guard_proc) => {
-                            if Arc::ptr_eq(&proc, guard_proc) {
-                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                guard.take_and_deinit();
-                            } else {
-                                // Another task already spawned another redo process (further up in this method)
-                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                            }
-                        }
-                    }
-                }
-            }
-            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-            drop(proc);
-        }
-
-        result
-    }
-
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -411,63 +319,130 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let base_img = &base_img;
-            let closure = |proc: Arc<Process>| async move {
-                let started_at = std::time::Instant::now();
-
-                // Relational WAL records are applied using wal-redo-postgres
-                let result = proc
-                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-                    .context("apply_wal_records");
-
-                let duration = started_at.elapsed();
-
-                let len = records.len();
-                let nbytes = records.iter().fold(0, |acumulator, record| {
-                    acumulator
-                        + match &record.1 {
-                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                        }
-                });
-
-                WAL_REDO_TIME.observe(duration.as_secs_f64());
-                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-                debug!(
-                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                    len,
-                    nbytes,
-                    duration.as_micros(),
-                    lsn
-                );
-
-                if let Err(e) = result.as_ref() {
-                    error!(
-                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                        records.len(),
-                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                        nbytes,
-                        base_img_lsn,
-                        lsn,
-                        n_attempts,
-                        e,
+            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+                Ok(guard) => match &*guard {
+                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                    ProcessOnceCell::ManagerShutDown => {
+                        return Err(Error::Cancelled);
+                    }
+                },
+                Err(permit) => {
+                    let start = Instant::now();
+                    // acquire guard before spawning process, so that we don't spawn new processes
+                    // if the gate is already closed.
+                    let _launched_processes_guard = match self.launched_processes.enter() {
+                                Ok(guard) => guard,
+                                Err(GateError::GateClosed) => unreachable!(
+                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                                ),
+                            };
+                    let proc = Arc::new(Process {
+                        process: process::WalRedoProcess::launch(
+                            self.conf,
+                            self.tenant_shard_id,
+                            pg_version,
+                        )
+                        .context("launch walredo process")?,
+                        _launched_processes_guard,
+                    });
+                    let duration = start.elapsed();
+                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                    info!(
+                        duration_ms = duration.as_millis(),
+                        pid = proc.id(),
+                        "launched walredo process"
                    );
+                    self.redo_process
+                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                    proc
                }
-
-                result.map_err(Error::Other)
            };
-            let result = self.do_with_walredo_process(pg_version, closure).await;

-            if result.is_ok() && n_attempts != 0 {
+            let started_at = std::time::Instant::now();
+
+            // Relational WAL records are applied using wal-redo-postgres
+            let result = proc
+                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
+                .await
+                .context("apply_wal_records");
+
+            let duration = started_at.elapsed();
+
+            let len = records.len();
+            let nbytes = records.iter().fold(0, |acumulator, record| {
+                acumulator
+                    + match &record.1 {
+                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                    }
+            });
+
+            WAL_REDO_TIME.observe(duration.as_secs_f64());
+            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+            debug!(
+                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                len,
+                nbytes,
+                duration.as_micros(),
+                lsn
+            );
+
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if let Err(e) = result.as_ref() {
+                error!(
+                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                    records.len(),
+                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                    nbytes,
+                    base_img_lsn,
+                    lsn,
+                    n_attempts,
+                    e,
+                );
+                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+                // Note that there may be other tasks concurrent with us that also hold `proc`.
+                // We have to deal with that here.
+                // Also read the doc comment on field `self.redo_process`.
+                //
+                // NB: there may still be other concurrent threads using `proc`.
+                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+                //
+                // NB: the drop impl blocks the dropping thread with a wait() system call for
+                // the child process. In some ways the blocking is actually good: if we
+                // deferred the waiting into the background / to tokio if we used `tokio::process`,
+                // it could happen that if walredo always fails immediately, we spawn processes faster
+                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+                // This probably needs revisiting at some later point.
+                match self.redo_process.get() {
+                    None => (),
+                    Some(guard) => {
+                        match &*guard {
+                            ProcessOnceCell::ManagerShutDown => {}
+                            ProcessOnceCell::Spawned(guard_proc) => {
+                                if Arc::ptr_eq(&proc, guard_proc) {
+                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                    guard.take_and_deinit();
+                                } else {
+                                    // Another task already spawned another redo process (further up in this method)
+                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                                }
+                            }
+                        }
+                    }
+                }
+                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+                drop(proc);
+            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
            n_attempts += 1;
            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result;
+                return result.map_err(Error::Other);
            }
        }
    }
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -18,6 +18,7 @@ atomic-take.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
+aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -25,6 +26,7 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -46,9 +48,11 @@ indexmap.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
+md5.workspace = true
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
+opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -63,6 +67,7 @@ reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
+routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -74,6 +79,7 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
+task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -82,6 +88,7 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
+tower-service.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -163,7 +163,6 @@ impl ComputeUserInfo {
 }

 pub(crate) enum ComputeCredentialKeys {
-    #[cfg(any(test, feature = "testing"))]
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
    None,
@@ -294,10 +293,16 @@ async fn auth_quirks(
    // We now expect to see a very specific payload in the place of password.
    let (info, unauthenticated_password) = match user_info.try_into() {
        Err(info) => {
-            let (info, password) =
-                hacks::password_hack_no_authentication(ctx, info, client).await?;
-            ctx.set_endpoint_id(info.endpoint.clone());
-            (info, Some(password))
+            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
+
+            ctx.set_endpoint_id(res.info.endpoint.clone());
+            let password = match res.keys {
+                ComputeCredentialKeys::Password(p) => p,
+                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
+                    unreachable!("password hack should return a password")
+                }
+            };
+            (res.info, Some(password))
        }
        Ok(info) => (info, None),
    };
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,4 +1,6 @@
-use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
+use super::{
+    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
+};
 use crate::{
    auth::{self, AuthFlow},
    config::AuthenticationConfig,
@@ -61,7 +63,7 @@ pub(crate) async fn password_hack_no_authentication(
    ctx: &RequestMonitoring,
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
+) -> auth::Result<ComputeCredentials> {
    warn!("project not specified, resorting to the password hack auth flow");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

@@ -77,12 +79,12 @@ pub(crate) async fn password_hack_no_authentication(
    info!(project = &*payload.endpoint, "received missing parameter");

    // Report tentative success; compute node will check the password anyway.
-    Ok((
-        ComputeUserInfo {
+    Ok(ComputeCredentials {
+        info: ComputeUserInfo {
            user: info.user,
            options: info.options,
            endpoint: payload.endpoint,
        },
-        payload.password,
-    ))
+        keys: ComputeCredentialKeys::Password(payload.password),
+    })
 }
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -25,8 +25,6 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
    fn fetch_auth_rules(
        &self,
-        ctx: &RequestMonitoring,
-        endpoint: EndpointId,
        role_name: RoleName,
    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
@@ -103,9 +101,7 @@ impl JwkCacheEntryLock {
    async fn renew_jwks<F: FetchAuthRules>(
        &self,
        _permit: JwkRenewalPermit<'_>,
-        ctx: &RequestMonitoring,
        client: &reqwest::Client,
-        endpoint: EndpointId,
        role_name: RoleName,
        auth_rules: &F,
    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
@@ -119,9 +115,7 @@ impl JwkCacheEntryLock {
            }
        }

-        let rules = auth_rules
-            .fetch_auth_rules(ctx, endpoint, role_name)
-            .await?;
+        let rules = auth_rules.fetch_auth_rules(role_name).await?;
        let mut key_sets =
            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
        // TODO(conrad): run concurrently
@@ -172,7 +166,6 @@ impl JwkCacheEntryLock {
        self: &Arc<Self>,
        ctx: &RequestMonitoring,
        client: &reqwest::Client,
-        endpoint: EndpointId,
        role_name: RoleName,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
@@ -183,9 +176,7 @@ impl JwkCacheEntryLock {
        let Some(cached) = guard else {
            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
        };

        let last_update = now.duration_since(cached.last_retrieved);
@@ -196,9 +187,7 @@ impl JwkCacheEntryLock {
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
-            return self
-                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
-                .await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
@@ -209,12 +198,8 @@ impl JwkCacheEntryLock {
                let entry = self.clone();
                let client = client.clone();
                let fetch = fetch.clone();
-                let ctx = ctx.clone();
                tokio::spawn(async move {
-                    if let Err(e) = entry
-                        .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
-                        .await
-                    {
+                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
                    }
                });
@@ -231,7 +216,6 @@ impl JwkCacheEntryLock {
        ctx: &RequestMonitoring,
        jwt: &str,
        client: &reqwest::Client,
-        endpoint: EndpointId,
        role_name: RoleName,
        fetch: &F,
    ) -> Result<(), anyhow::Error> {
@@ -258,7 +242,7 @@ impl JwkCacheEntryLock {
        let kid = header.key_id.context("missing key id")?;

        let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
+            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
            .await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
@@ -270,14 +254,7 @@ impl JwkCacheEntryLock {

                    let permit = self.acquire_permit().await;
                    guard = self
-                        .renew_jwks(
-                            permit,
-                            ctx,
-                            client,
-                            endpoint.clone(),
-                            role_name.clone(),
-                            fetch,
-                        )
+                        .renew_jwks(permit, client, role_name.clone(), fetch)
                        .await?;
                }
                _ => {
@@ -341,7 +318,7 @@ impl JwkCache {
        jwt: &str,
    ) -> Result<(), anyhow::Error> {
        // try with just a read lock first
-        let key = (endpoint.clone(), role_name.clone());
+        let key = (endpoint, role_name.clone());
        let entry = self.map.get(&key).as_deref().map(Arc::clone);
        let entry = entry.unwrap_or_else(|| {
            // acquire a write lock after to insert.
@@ -350,7 +327,7 @@ impl JwkCache {
        });

        entry
-            .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
+            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
            .await
    }
 }
@@ -711,8 +688,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        impl FetchAuthRules for Fetch {
            async fn fetch_auth_rules(
                &self,
-                _ctx: &RequestMonitoring,
-                _endpoint: EndpointId,
                _role_name: RoleName,
            ) -> anyhow::Result<Vec<AuthRule>> {
                Ok(vec![
@@ -731,7 +706,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        }

        let role_name = RoleName::from("user");
-        let endpoint = EndpointId::from("ep");

        let jwk_cache = Arc::new(JwkCacheEntryLock::default());

@@ -741,7 +715,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
                    &RequestMonitoring::test(),
                    &token,
                    &client,
-                    endpoint.clone(),
                    role_name.clone(),
                    &Fetch(addr),
                )
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -9,9 +9,8 @@ use crate::{
        messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
        NodeInfo,
    },
-    context::RequestMonitoring,
    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    EndpointId, RoleName,
+    RoleName,
 };

 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -58,12 +57,7 @@ pub struct JwksRoleSettings {
 }

 impl FetchAuthRules for StaticAuthRules {
-    async fn fetch_auth_rules(
-        &self,
-        _ctx: &RequestMonitoring,
-        _endpoint: EndpointId,
-        role_name: RoleName,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
        let mappings = JWKS_ROLE_MAP.load();
        let role_mappings = mappings
            .as_deref()
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -92,12 +92,6 @@ struct SqlOverHttpArgs {

    #[clap(long, default_value_t = 16)]
    sql_over_http_cancel_set_shards: usize,
-
-    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
-
-    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_response_size_bytes: usize,
 }

 #[tokio::main]
@@ -214,8 +208,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
        },
        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
-        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
-        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };

    Ok(Box::leak(Box::new(ProxyConfig {
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -62,13 +62,12 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
    Console,
+    #[cfg(feature = "testing")]
+    Postgres,
    // clap only shows the name, not the alias, in usage text.
    // TODO: swap name/alias and deprecate "link"
    #[value(name("link"), alias("web"))]
    Web,
-
-    #[cfg(feature = "testing")]
-    Postgres,
 }

 /// Neon proxy/router
@@ -269,12 +268,6 @@ struct SqlOverHttpArgs {

    #[clap(long, default_value_t = 64)]
    sql_over_http_cancel_set_shards: usize,
-
-    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
-
-    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_response_size_bytes: usize,
 }

 #[tokio::main]
@@ -640,19 +633,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            let api = console::provider::ConsoleBackend::Console(api);
            auth::Backend::Console(MaybeOwned::Owned(api), ())
        }
-
-        AuthBackendType::Web => {
-            let url = args.uri.parse()?;
-            auth::Backend::Web(MaybeOwned::Owned(url), ())
-        }
-
        #[cfg(feature = "testing")]
        AuthBackendType::Postgres => {
            let url = args.auth_endpoint.parse()?;
-            let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy);
+            let api = console::provider::mock::Api::new(url);
            let api = console::provider::ConsoleBackend::Postgres(api);
            auth::Backend::Console(MaybeOwned::Owned(api), ())
        }
+        AuthBackendType::Web => {
+            let url = args.uri.parse()?;
+            auth::Backend::Web(MaybeOwned::Owned(url), ())
+        }
    };

    let config::ConcurrencyLockOptions {
@@ -688,8 +679,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        },
        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
-        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
-        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };
    let authentication_config = AuthenticationConfig {
        thread_pool,
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -56,8 +56,6 @@ pub struct HttpConfig {
    pub pool_options: GlobalConnPoolOptions,
    pub cancel_set: CancelSet,
    pub client_conn_threshold: u64,
-    pub max_request_size_bytes: u64,
-    pub max_response_size_bytes: usize,
 }

 pub struct AuthenticationConfig {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -303,7 +303,6 @@ impl NodeInfo {

    pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
        match keys {
-            #[cfg(any(test, feature = "testing"))]
            ComputeCredentialKeys::Password(password) => self.config.password(password),
            ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
            ComputeCredentialKeys::None => &mut self.config,
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -41,15 +41,11 @@ impl From<tokio_postgres::Error> for ApiError {
 #[derive(Clone)]
 pub struct Api {
    endpoint: ApiUrl,
-    ip_allowlist_check_enabled: bool,
 }

 impl Api {
-    pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
-        Self {
-            endpoint,
-            ip_allowlist_check_enabled,
-        }
+    pub fn new(endpoint: ApiUrl) -> Self {
+        Self { endpoint }
    }

    pub(crate) fn url(&self) -> &str {
@@ -68,7 +64,6 @@ impl Api {
                tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;

            tokio::spawn(connection);
-
            let secret = if let Some(entry) = get_execute_postgres_query(
                &client,
                "select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -84,26 +79,21 @@ impl Api {
                warn!("user '{}' does not exist", user_info.user);
                None
            };
-
-            let allowed_ips = if self.ip_allowlist_check_enabled {
-                match get_execute_postgres_query(
-                    &client,
-                    "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                    &[&user_info.endpoint.as_str()],
-                    "allowed_ips",
-                )
-                .await?
-                {
-                    Some(s) => {
-                        info!("got allowed_ips: {s}");
-                        s.split(',')
-                            .map(|s| IpPattern::from_str(s).unwrap())
-                            .collect()
-                    }
-                    None => vec![],
+            let allowed_ips = match get_execute_postgres_query(
+                &client,
+                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
+                &[&user_info.endpoint.as_str()],
+                "allowed_ips",
+            )
+            .await?
+            {
+                Some(s) => {
+                    info!("got allowed_ips: {s}");
+                    s.split(',')
+                        .map(|s| IpPattern::from_str(s).unwrap())
+                        .collect()
                }
-            } else {
-                vec![]
+                None => vec![],
            };

            Ok((secret, allowed_ips))
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -79,40 +79,6 @@ pub(crate) enum AuthMethod {
    Cleartext,
 }

-impl Clone for RequestMonitoring {
-    fn clone(&self) -> Self {
-        let inner = self.0.try_lock().expect("should not deadlock");
-        let new = RequestMonitoringInner {
-            peer_addr: inner.peer_addr,
-            session_id: inner.session_id,
-            protocol: inner.protocol,
-            first_packet: inner.first_packet,
-            region: inner.region,
-            span: info_span!("background_task"),
-
-            project: inner.project,
-            branch: inner.branch,
-            endpoint_id: inner.endpoint_id.clone(),
-            dbname: inner.dbname.clone(),
-            user: inner.user.clone(),
-            application: inner.application.clone(),
-            error_kind: inner.error_kind,
-            auth_method: inner.auth_method.clone(),
-            success: inner.success,
-            rejected: inner.rejected,
-            cold_start_info: inner.cold_start_info,
-            pg_options: inner.pg_options.clone(),
-
-            sender: None,
-            disconnect_sender: None,
-            latency_timer: LatencyTimer::noop(inner.protocol),
-            disconnect_timestamp: inner.disconnect_timestamp,
-        };
-
-        Self(TryLock::new(new))
-    }
-}
-
 impl RequestMonitoring {
    pub fn new(
        session_id: Uuid,
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -397,8 +397,6 @@ pub struct LatencyTimer {
    protocol: Protocol,
    cold_start_info: ColdStartInfo,
    outcome: ConnectOutcome,
-
-    skip_reporting: bool,
 }

 impl LatencyTimer {
@@ -411,20 +409,6 @@ impl LatencyTimer {
            cold_start_info: ColdStartInfo::Unknown,
            // assume failed unless otherwise specified
            outcome: ConnectOutcome::Failed,
-            skip_reporting: false,
-        }
-    }
-
-    pub(crate) fn noop(protocol: Protocol) -> Self {
-        Self {
-            start: time::Instant::now(),
-            stop: None,
-            accumulated: Accumulated::default(),
-            protocol,
-            cold_start_info: ColdStartInfo::Unknown,
-            // assume failed unless otherwise specified
-            outcome: ConnectOutcome::Failed,
-            skip_reporting: true,
        }
    }

@@ -459,10 +443,6 @@ pub enum ConnectOutcome {

 impl Drop for LatencyTimer {
    fn drop(&mut self) {
-        if self.skip_reporting {
-            return;
-        }
-
        let duration = self
            .stop
            .unwrap_or_else(time::Instant::now)
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -27,7 +27,7 @@ use crate::{
    Host,
 };

-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};

 pub(crate) struct PoolingBackend {
    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -274,6 +274,13 @@ impl ConnectMechanism for TokioMechanism {
            .dbname(&self.conn_info.dbname)
            .connect_timeout(timeout);

+        match &self.conn_info.auth {
+            AuthData::Jwt(_) => {}
+            AuthData::Password(pw) => {
+                config.password(pw);
+            }
+        }
+
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
        let res = config.connect(tokio_postgres::NoTls).await;
        drop(pause);
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -29,16 +29,11 @@ use tracing::{info, info_span, Instrument};

 use super::backend::HttpConnError;

-#[derive(Debug, Clone)]
-pub(crate) struct ConnInfoWithAuth {
-    pub(crate) conn_info: ConnInfo,
-    pub(crate) auth: AuthData,
-}
-
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfo {
    pub(crate) user_info: ComputeUserInfo,
    pub(crate) dbname: DbName,
+    pub(crate) auth: AuthData,
 }

 #[derive(Debug, Clone)]
@@ -781,8 +776,6 @@ mod tests {
            },
            cancel_set: CancelSet::new(0),
            client_conn_threshold: u64::MAX,
-            max_request_size_bytes: u64::MAX,
-            max_response_size_bytes: usize::MAX,
        }));
        let pool = GlobalConnPool::new(config);
        let conn_info = ConnInfo {
@@ -792,6 +785,7 @@ mod tests {
                options: NeonOptions::default(),
            },
            dbname: "dbname".into(),
+            auth: AuthData::Password("password".as_bytes().into()),
        };
        let ep_pool = Arc::downgrade(
            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -849,6 +843,7 @@ mod tests {
                options: NeonOptions::default(),
            },
            dbname: "dbname".into(),
+            auth: AuthData::Password("password".as_bytes().into()),
        };
        let ep_pool = Arc::downgrade(
            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -60,7 +60,6 @@ use super::backend::PoolingBackend;
 use super::conn_pool::AuthData;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
-use super::conn_pool::ConnInfoWithAuth;
 use super::http_util::json_response;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -88,6 +87,9 @@ enum Payload {
    Batch(BatchQueryData),
 }

+const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
+const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
+
 static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -149,7 +151,7 @@ fn get_conn_info(
    ctx: &RequestMonitoring,
    headers: &HeaderMap,
    tls: Option<&TlsConfig>,
-) -> Result<ConnInfoWithAuth, ConnInfoError> {
+) -> Result<ConnInfo, ConnInfoError> {
    // HTTP only uses cleartext (for now and likely always)
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

@@ -236,8 +238,11 @@ fn get_conn_info(
        options: options.unwrap_or_default(),
    };

-    let conn_info = ConnInfo { user_info, dbname };
-    Ok(ConnInfoWithAuth { conn_info, auth })
+    Ok(ConnInfo {
+        user_info,
+        dbname,
+        auth,
+    })
 }

 // TODO: return different http error codes
@@ -361,10 +366,10 @@ pub(crate) enum SqlOverHttpError {
    ConnectCompute(#[from] HttpConnError),
    #[error("{0}")]
    ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {0} bytes)")]
-    RequestTooLarge(u64),
-    #[error("response is too large (max is {0} bytes)")]
-    ResponseTooLarge(usize),
+    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
+    RequestTooLarge,
+    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
+    ResponseTooLarge,
    #[error("invalid isolation level")]
    InvalidIsolationLevel,
    #[error("{0}")]
@@ -381,8 +386,8 @@ impl ReportableError for SqlOverHttpError {
            SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
            SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
            SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
-            SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
+            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
            SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
            SqlOverHttpError::Postgres(p) => p.get_error_kind(),
            SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
@@ -397,8 +402,8 @@ impl UserFacingError for SqlOverHttpError {
            SqlOverHttpError::ReadPayload(p) => p.to_string(),
            SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
            SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
-            SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
+            SqlOverHttpError::RequestTooLarge => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge => self.to_string(),
            SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
            SqlOverHttpError::Postgres(p) => p.to_string(),
            SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
@@ -521,10 +526,7 @@ async fn handle_inner(

    // TLS config should be there.
    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
-    info!(
-        user = conn_info.conn_info.user_info.user.as_str(),
-        "credentials"
-    );
+    info!(user = conn_info.user_info.user.as_str(), "credentials");

    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
@@ -535,7 +537,7 @@ async fn handle_inner(

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
-        None => config.http_config.max_request_size_bytes + 1,
+        None => MAX_REQUEST_SIZE + 1,
    };
    info!(request_content_length, "request size in bytes");
    Metrics::get()
@@ -545,10 +547,8 @@ async fn handle_inner(

    // we don't have a streaming request support yet so this is to prevent OOM
    // from a malicious user sending an extremely large request body
-    if request_content_length > config.http_config.max_request_size_bytes {
-        return Err(SqlOverHttpError::RequestTooLarge(
-            config.http_config.max_request_size_bytes,
-        ));
+    if request_content_length > MAX_REQUEST_SIZE {
+        return Err(SqlOverHttpError::RequestTooLarge);
    }

    let fetch_and_process_request = Box::pin(
@@ -569,20 +569,20 @@ async fn handle_inner(
                        .authenticate_with_password(
                            ctx,
                            &config.authentication_config,
-                            &conn_info.conn_info.user_info,
+                            &conn_info.user_info,
                            pw,
                        )
                        .await?
                }
                AuthData::Jwt(jwt) => {
                    backend
-                        .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
+                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
                        .await?
                }
            };

            let client = backend
-                .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
+                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
                .await?;
            // not strictly necessary to mark success here,
            // but it's just insurance for if we forget it somewhere else
@@ -612,10 +612,7 @@ async fn handle_inner(

    // Now execute the query and return the result.
    let json_output = match payload {
-        Payload::Single(stmt) => {
-            stmt.process(config, cancel, &mut client, parsed_headers)
-                .await?
-        }
+        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
        Payload::Batch(statements) => {
            if parsed_headers.txn_read_only {
                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
@@ -631,7 +628,7 @@ async fn handle_inner(
            }

            statements
-                .process(config, cancel, &mut client, parsed_headers)
+                .process(cancel, &mut client, parsed_headers)
                .await?
        }
    };
@@ -659,7 +656,6 @@ async fn handle_inner(
 impl QueryData {
    async fn process(
        self,
-        config: &'static ProxyConfig,
        cancel: CancellationToken,
        client: &mut Client<tokio_postgres::Client>,
        parsed_headers: HttpHeaders,
@@ -668,7 +664,7 @@ impl QueryData {
        let cancel_token = inner.cancel_token();

        let res = match select(
-            pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
+            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
            pin!(cancel.cancelled()),
        )
        .await
@@ -731,7 +727,6 @@ impl QueryData {
 impl BatchQueryData {
    async fn process(
        self,
-        config: &'static ProxyConfig,
        cancel: CancellationToken,
        client: &mut Client<tokio_postgres::Client>,
        parsed_headers: HttpHeaders,
@@ -756,52 +751,44 @@ impl BatchQueryData {
            discard.discard();
        })?;

-        let json_output = match query_batch(
-            config,
-            cancel.child_token(),
-            &transaction,
-            self,
-            parsed_headers,
-        )
-        .await
-        {
-            Ok(json_output) => {
-                info!("commit");
-                let status = transaction.commit().await.inspect_err(|_| {
-                    // if we cannot commit - for now don't return connection to pool
-                    // TODO: get a query status from the error
-                    discard.discard();
-                })?;
-                discard.check_idle(status);
-                json_output
-            }
-            Err(SqlOverHttpError::Cancelled(_)) => {
-                if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                    tracing::error!(?err, "could not cancel query");
+        let json_output =
+            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
+                Ok(json_output) => {
+                    info!("commit");
+                    let status = transaction.commit().await.inspect_err(|_| {
+                        // if we cannot commit - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                    })?;
+                    discard.check_idle(status);
+                    json_output
                }
-                // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
-                discard.discard();
-
-                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-            }
-            Err(err) => {
-                info!("rollback");
-                let status = transaction.rollback().await.inspect_err(|_| {
-                    // if we cannot rollback - for now don't return connection to pool
-                    // TODO: get a query status from the error
+                Err(SqlOverHttpError::Cancelled(_)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
                    discard.discard();
-                })?;
-                discard.check_idle(status);
-                return Err(err);
-            }
-        };
+
+                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                }
+                Err(err) => {
+                    info!("rollback");
+                    let status = transaction.rollback().await.inspect_err(|_| {
+                        // if we cannot rollback - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                    })?;
+                    discard.check_idle(status);
+                    return Err(err);
+                }
+            };

        Ok(json_output)
    }
 }

 async fn query_batch(
-    config: &'static ProxyConfig,
    cancel: CancellationToken,
    transaction: &Transaction<'_>,
    queries: BatchQueryData,
@@ -811,7 +798,6 @@ async fn query_batch(
    let mut current_size = 0;
    for stmt in queries.queries {
        let query = pin!(query_to_json(
-            config,
            transaction,
            stmt,
            &mut current_size,
@@ -840,7 +826,6 @@ async fn query_batch(
 }

 async fn query_to_json<T: GenericClient>(
-    config: &'static ProxyConfig,
    client: &T,
    data: QueryData,
    current_size: &mut usize,
@@ -861,10 +846,8 @@ async fn query_to_json<T: GenericClient>(
        rows.push(row);
        // we don't have a streaming response support yet so this is to prevent OOM
        // from a malicious query (eg a cross join)
-        if *current_size > config.http_config.max_response_size_bytes {
-            return Err(SqlOverHttpError::ResponseTooLarge(
-                config.http_config.max_response_size_bytes,
-            ));
+        if *current_size > MAX_RESPONSE_SIZE {
+            return Err(SqlOverHttpError::ResponseTooLarge);
        }
    }

--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -13,12 +13,14 @@ testing = ["fail/failpoints"]
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
+async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
 camino-tempfile.workspace = true
 chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
+const_format.workspace = true
 crc32c.workspace = true
 fail.workspace = true
 git-version.workspace = true
@@ -36,6 +38,8 @@ scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
+serde_with.workspace = true
+signal-hook.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
@@ -44,6 +48,7 @@ tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-tar.workspace = true
+toml_edit.workspace = true
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -17,7 +17,6 @@ use postgres_ffi::MAX_SEND_SIZE;
 use serde::Deserialize;
 use serde::Serialize;

-use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName};
 use sha2::{Digest, Sha256};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
@@ -52,9 +51,6 @@ pub struct Args {
    /// Dump full term history. True by default.
    pub dump_term_history: bool,

-    /// Dump last modified time of WAL segments. Uses value of `dump_all` by default.
-    pub dump_wal_last_modified: bool,
-
    /// Filter timelines by tenant_id.
    pub tenant_id: Option<TenantId>,

@@ -132,19 +128,12 @@ async fn build_from_tli_dump(
        None
    };

-    let wal_last_modified = if args.dump_wal_last_modified {
-        get_wal_last_modified(timeline_dir).ok().flatten()
-    } else {
-        None
-    };
-
    Timeline {
        tenant_id: timeline.ttid.tenant_id,
        timeline_id: timeline.ttid.timeline_id,
        control_file,
        memory,
        disk_content,
-        wal_last_modified,
    }
 }

@@ -167,7 +156,6 @@ pub struct Timeline {
    pub control_file: Option<TimelinePersistentState>,
    pub memory: Option<Memory>,
    pub disk_content: Option<DiskContent>,
-    pub wal_last_modified: Option<DateTime<Utc>>,
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -314,27 +302,6 @@ fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
    })
 }

-/// Get highest modified time of WAL segments in the directory.
-fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
-    let mut res = None;
-    for entry in fs::read_dir(path)? {
-        if entry.is_err() {
-            continue;
-        }
-        let entry = entry?;
-        /* Ignore files that are not XLOG segments */
-        let fname = entry.file_name();
-        if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) {
-            continue;
-        }
-
-        let metadata = entry.metadata()?;
-        let modified: DateTime<Utc> = DateTime::from(metadata.modified()?);
-        res = std::cmp::max(res, Some(modified));
-    }
-    Ok(res)
-}
-
 /// Converts SafeKeeperConf to Config, filtering out the fields that are not
 /// supposed to be exposed.
 fn build_config(config: SafeKeeperConf) -> Config {
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -1,11 +1,7 @@
 openapi: "3.0.2"
 info:
  title: Safekeeper control API
-  description: Neon Safekeeper API
  version: "1.0"
-  license:
-    name: "Apache"
-    url: https://github.com/neondatabase/neon/blob/main/LICENSE


 servers:
@@ -390,12 +386,6 @@ components:
        msg:
          type: string

-    NotFoundError:
-      type: object
-      properties:
-        msg:
-          type: string
-
  responses:

    #
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -481,7 +481,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
    let mut dump_memory: Option<bool> = None;
    let mut dump_disk_content: Option<bool> = None;
    let mut dump_term_history: Option<bool> = None;
-    let mut dump_wal_last_modified: Option<bool> = None;
    let mut tenant_id: Option<TenantId> = None;
    let mut timeline_id: Option<TimelineId> = None;

@@ -495,7 +494,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
            "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
            "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
            "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
-            "dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?),
            "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
            "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
            _ => Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -510,7 +508,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
    let dump_memory = dump_memory.unwrap_or(dump_all);
    let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
    let dump_term_history = dump_term_history.unwrap_or(true);
-    let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);

    let args = debug_dump::Args {
        dump_all,
@@ -518,7 +515,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
        dump_memory,
        dump_disk_content,
        dump_term_history,
-        dump_wal_last_modified,
        tenant_id,
        timeline_id,
    };
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -278,7 +278,7 @@ impl WalResidentTimeline {
 }

 /// pull_timeline request body.
-#[derive(Debug, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct Request {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
@@ -293,7 +293,7 @@ pub struct Response {
 }

 /// Response for debug dump request.
-#[derive(Debug, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct DebugDumpResponse {
    pub start_time: DateTime<Utc>,
    pub finish_time: DateTime<Utc>,
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -539,17 +539,20 @@ async fn remove_segments_from_disk(
    while let Some(entry) = entries.next_entry().await? {
        let entry_path = entry.path();
        let fname = entry_path.file_name().unwrap();
-        /* Ignore files that are not XLOG segments */
-        if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) {
-            continue;
-        }
-        let (segno, _) = XLogFromFileName(fname, wal_seg_size)?;
-        if remove_predicate(segno) {
-            remove_file(entry_path).await?;
-            n_removed += 1;
-            min_removed = min(min_removed, segno);
-            max_removed = max(max_removed, segno);
-            REMOVED_WAL_SEGMENTS.inc();
+
+        if let Some(fname_str) = fname.to_str() {
+            /* Ignore files that are not XLOG segments */
+            if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) {
+                continue;
+            }
+            let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
+            if remove_predicate(segno) {
+                remove_file(entry_path).await?;
+                n_removed += 1;
+                min_removed = min(min_removed, segno);
+                max_removed = max(max_removed, segno);
+                REMOVED_WAL_SEGMENTS.inc();
+            }
        }
    }

--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -10,6 +10,7 @@ bench = []
 [dependencies]
 anyhow.workspace = true
 async-stream.workspace = true
+bytes.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 futures.workspace = true
@@ -23,6 +24,7 @@ parking_lot.workspace = true
 prost.workspace = true
 tonic.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio-stream.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -15,7 +15,9 @@ testing = []

 [dependencies]
 anyhow.workspace = true
+aws-config.workspace = true
 bytes.workspace = true
+camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
--- a/storage_controller/client/Cargo.toml
+++ b/storage_controller/client/Cargo.toml
@@ -5,7 +5,18 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+pageserver_api.workspace = true
 pageserver_client.workspace = true
+thiserror.workspace = true
 reqwest.workspace = true
+utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio.workspace = true
+futures.workspace = true
+tokio-util.workspace = true
+anyhow.workspace = true
+postgres.workspace = true
+bytes.workspace = true
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1,11 +1,10 @@
-use crate::http;
 use crate::metrics::{
    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
    METRICS_REGISTRY,
 };
 use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
@@ -23,7 +22,6 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -89,16 +87,9 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 }

 /// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::GenerationsApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -106,16 +97,9 @@ async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError

 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::GenerationsApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.validate(validate_req).await?)
@@ -124,16 +108,9 @@ async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError>
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
-async fn handle_attach_hook(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

@@ -147,16 +124,9 @@ async fn handle_attach_hook(req: Request<Body>) -> Result<Response<Body>, ApiErr
    )
 }

-async fn handle_inspect(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req);
@@ -166,17 +136,10 @@ async fn handle_inspect(req: Request<Body>) -> Result<Response<Body>, ApiError>

 async fn handle_tenant_create(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;

    json_response(
@@ -187,18 +150,11 @@ async fn handle_tenant_create(

 async fn handle_tenant_location_config(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
@@ -210,17 +166,10 @@ async fn handle_tenant_location_config(

 async fn handle_tenant_config_set(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let config_req = json_request::<TenantConfigRequest>(&mut req).await?;

    json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
@@ -233,30 +182,16 @@ async fn handle_tenant_config_get(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
    json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
 }

 async fn handle_tenant_time_travel_remote_storage(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;

    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -297,13 +232,6 @@ async fn handle_tenant_secondary_download(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
    json_response(map_reqwest_hyper_status(status)?, progress)
 }
@@ -315,13 +243,6 @@ async fn handle_tenant_delete(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
    let status_code = service
        .tenant_delete(tenant_id)
        .await
@@ -337,18 +258,11 @@ async fn handle_tenant_delete(

 async fn handle_tenant_timeline_create(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
    json_response(
        StatusCode::CREATED,
@@ -363,16 +277,9 @@ async fn handle_tenant_timeline_delete(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
    check_permissions(&req, Scope::PageServerApi)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    // For timeline deletions, which both implement an "initially return 202, then 404 once
    // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
@@ -430,19 +337,12 @@ async fn handle_tenant_timeline_delete(

 async fn handle_tenant_timeline_archival_config(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
    check_permissions(&req, Scope::PageServerApi)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;

@@ -458,16 +358,9 @@ async fn handle_tenant_timeline_detach_ancestor(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
    check_permissions(&req, Scope::PageServerApi)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    let res = service
        .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
@@ -500,13 +393,6 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let Some(path) = req.uri().path_and_query() else {
        // This should never happen, our request router only calls us if there is a path
        return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
@@ -574,17 +460,9 @@ async fn handle_tenant_locate(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-
    check_permissions(&req, Scope::Admin)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

@@ -595,14 +473,6 @@ async fn handle_tenant_describe(
    check_permissions(&req, Scope::Scrubber)?;

    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-
-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }

@@ -612,26 +482,12 @@ async fn handle_tenant_list(
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
    json_response(StatusCode::OK, service.tenant_list())
 }

-async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
    state.service.node_register(register_req).await?;
@@ -641,13 +497,6 @@ async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiE
 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let nodes = state.service.node_list().await?;
    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
@@ -658,13 +507,6 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
@@ -673,28 +515,14 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
 }

-async fn handle_node_configure(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
    if node_id != config_req.node_id {
@@ -720,13 +548,6 @@ async fn handle_node_configure(req: Request<Body>) -> Result<Response<Body>, Api
 async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;

@@ -749,13 +570,6 @@ async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiErr
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let leader = state.service.get_leader().await.map_err(|err| {
        ApiError::InternalServerError(anyhow::anyhow!(
@@ -769,13 +583,6 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;

@@ -787,13 +594,6 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;

@@ -805,13 +605,6 @@ async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;

@@ -823,13 +616,6 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;

@@ -838,16 +624,9 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Scrubber)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
    let state = get_state(&req);

@@ -861,13 +640,6 @@ async fn handle_metadata_health_list_unhealthy(
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;

@@ -880,17 +652,10 @@ async fn handle_metadata_health_list_unhealthy(
 }

 async fn handle_metadata_health_list_outdated(
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
    let state = get_state(&req);
    let health_records = state
@@ -906,17 +671,10 @@ async fn handle_metadata_health_list_outdated(

 async fn handle_tenant_shard_split(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;

@@ -928,17 +686,10 @@ async fn handle_tenant_shard_split(

 async fn handle_tenant_shard_migrate(
    service: Arc<Service>,
-    req: Request<Body>,
+    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
    json_response(
@@ -949,16 +700,9 @@ async fn handle_tenant_shard_migrate(
    )
 }

-async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
    let state = get_state(&req);
@@ -972,16 +716,9 @@ async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body
    )
 }

-async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let mut req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
    let state = get_state(&req);

@@ -994,28 +731,13 @@ async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.step_down().await)
 }

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::PageServerApi)?;
-
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-
-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
+    check_permissions(&req, Scope::PageServerApi)?;

    let state = get_state(&req);

@@ -1023,16 +745,8 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
 }

 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::PageServerApi)?;
-
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-
-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
+    check_permissions(&req, Scope::PageServerApi)?;

    let state = get_state(&req);

@@ -1045,13 +759,6 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    state.service.tenants_dump()
 }
@@ -1059,13 +766,6 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    state.service.scheduler_dump()
 }
@@ -1073,13 +773,6 @@ async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, Api
 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -1088,40 +781,19 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
 }

 /// Status endpoint is just used for checking that our HTTP listener is up
-async fn handle_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(_req) => {}
-    };
-
+async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
 }

 /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
 /// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
 async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    if state.service.startup_complete.is_ready() {
        json_response(StatusCode::OK, ())
@@ -1144,13 +816,6 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api

    let id = parse_request_param::<i64>(&req, "id")?;

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);

    let res = state.service.get_safekeeper(id).await;
@@ -1182,13 +847,6 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
        )));
    }

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);

    state.service.upsert_safekeeper(body).await?;
@@ -1267,7 +925,10 @@ pub fn prologue_leadership_status_check_middleware<

        let allowed_routes = match leadership_status {
            LeadershipStatus::Leader => AllowedRoutes::All,
-            LeadershipStatus::SteppedDown => AllowedRoutes::All,
+            LeadershipStatus::SteppedDown => {
+                // TODO: does it make sense to allow /status here?
+                AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
+            }
            LeadershipStatus::Candidate => {
                AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
            }
@@ -1344,13 +1005,6 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
 pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";

-    let req = match maybe_forward(req).await {
-        ForwardOutcome::Forwarded(res) => {
-            return res;
-        }
-        ForwardOutcome::NotForwarded(req) => req,
-    };
-
    let state = get_state(&req);
    let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
    let response = Response::builder()
@@ -1378,220 +1032,6 @@ where
    request_span(request, handler).await
 }

-enum ForwardOutcome {
-    Forwarded(Result<Response<Body>, ApiError>),
-    NotForwarded(Request<Body>),
-}
-
-/// Potentially forward the request to the current storage controler leader.
-/// More specifically we forward when:
-/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"]
-/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state
-/// 3. There is a leader in the database to forward to
-/// 4. Leader from step (3) is not the current instance
-///
-/// Why forward?
-/// It turns out that we can't rely on external orchestration to promptly route trafic to the
-/// new leader. This is downtime inducing. Forwarding provides a safe way out.
-///
-/// Why is it safe?
-/// If a storcon instance is persisted in the database, then we know that it is the current leader.
-/// There's one exception: time between handling step-down request and the new leader updating the
-/// database.
-///
-/// Let's treat the happy case first. The stepped down node does not produce any side effects,
-/// since all request handling happens on the leader.
-///
-/// As for the edge case, we are guaranteed to always have a maximum of two running instances.
-/// Hence, if we are in the edge case scenario the leader persisted in the database is the
-/// stepped down instance that received the request. Condition (4) above covers this scenario.
-async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
-    const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"];
-
-    let uri = req.uri().to_string();
-    let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
-
-    let state = get_state(&req);
-    let leadership_status = state.service.get_leadership_status();
-
-    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
-        return ForwardOutcome::NotForwarded(req);
-    }
-
-    let leader = state.service.get_leader().await;
-    let leader = {
-        match leader {
-            Ok(Some(leader)) => leader,
-            Ok(None) => {
-                return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable(
-                    "No leader to forward to while in stepped down state".into(),
-                )));
-            }
-            Err(err) => {
-                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
-                    anyhow::anyhow!(
-                        "Failed to get leader for forwarding while in stepped down state: {err}"
-                    ),
-                )));
-            }
-        }
-    };
-
-    let cfg = state.service.get_config();
-    if let Some(ref self_addr) = cfg.address_for_peers {
-        let leader_addr = match Uri::from_str(leader.address.as_str()) {
-            Ok(uri) => uri,
-            Err(err) => {
-                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
-                    anyhow::anyhow!(
-                    "Failed to parse leader uri for forwarding while in stepped down state: {err}"
-                ),
-                )));
-            }
-        };
-
-        if *self_addr == leader_addr {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Leader is stepped down instance"
-            ))));
-        }
-    }
-
-    tracing::info!("Forwarding {} to leader at {}", uri, leader.address);
-
-    // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and
-    // include some leeway to get the timeout for proxied requests.
-    const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10);
-    let client = reqwest::ClientBuilder::new()
-        .timeout(PROXIED_REQUEST_TIMEOUT)
-        .build();
-    let client = match client {
-        Ok(client) => client,
-        Err(err) => {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Failed to build leader client for forwarding while in stepped down state: {err}"
-            ))));
-        }
-    };
-
-    let request: reqwest::Request = match convert_request(req, &client, leader.address).await {
-        Ok(r) => r,
-        Err(err) => {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Failed to convert request for forwarding while in stepped down state: {err}"
-            ))));
-        }
-    };
-
-    let response = match client.execute(request).await {
-        Ok(r) => r,
-        Err(err) => {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Failed to forward while in stepped down state: {err}"
-            ))));
-        }
-    };
-
-    ForwardOutcome::Forwarded(convert_response(response).await)
-}
-
-/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through
-/// a stable representation (string, bytes or integer)
-///
-/// Ideally, we would not have to do this since both types use the http crate
-/// under the hood. However, they use different versions of the crate and keeping
-/// second order dependencies in sync is difficult.
-async fn convert_response(resp: reqwest::Response) -> Result<hyper::Response<Body>, ApiError> {
-    use std::str::FromStr;
-
-    let mut builder = hyper::Response::builder().status(resp.status().as_u16());
-    for (key, value) in resp.headers().into_iter() {
-        let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| {
-            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
-        })?;
-
-        let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
-            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
-        })?;
-
-        builder = builder.header(key, value);
-    }
-
-    let body = http::Body::wrap_stream(resp.bytes_stream());
-
-    builder.body(body).map_err(|err| {
-        ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
-    })
-}
-
-/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through
-/// a stable representation (string, bytes or integer)
-///
-/// See [`convert_response`] for why we are doing it this way.
-async fn convert_request(
-    req: hyper::Request<Body>,
-    client: &reqwest::Client,
-    to_address: String,
-) -> Result<reqwest::Request, ApiError> {
-    use std::str::FromStr;
-
-    let (parts, body) = req.into_parts();
-    let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| {
-        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-    })?;
-
-    let path_and_query = parts.uri.path_and_query().ok_or_else(|| {
-        ApiError::InternalServerError(anyhow::anyhow!(
-            "Request conversion failed: no path and query"
-        ))
-    })?;
-
-    let uri = reqwest::Url::from_str(
-        format!(
-            "{}{}",
-            to_address.trim_end_matches("/"),
-            path_and_query.as_str()
-        )
-        .as_str(),
-    )
-    .map_err(|err| {
-        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-    })?;
-
-    let mut headers = reqwest::header::HeaderMap::new();
-    for (key, value) in parts.headers.into_iter() {
-        let key = match key {
-            Some(k) => k,
-            None => {
-                continue;
-            }
-        };
-
-        let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| {
-            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-        })?;
-
-        let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
-            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-        })?;
-
-        headers.insert(key, value);
-    }
-
-    let body = hyper::body::to_bytes(body).await.map_err(|err| {
-        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-    })?;
-
-    client
-        .request(method, uri)
-        .headers(headers)
-        .body(body)
-        .build()
-        .map_err(|err| {
-            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
-        })
-}
-
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3,7 +3,6 @@ use std::{
    borrow::Cow,
    cmp::Ordering,
    collections::{BTreeMap, HashMap, HashSet},
-    error::Error,
    ops::Deref,
    path::PathBuf,
    str::FromStr,
@@ -219,16 +218,9 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
                format!("{node} error receiving error body: {str}").into(),
            )
        }
-        mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
-            // Return 500 for decoding errors.
-            ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
-        }
-        mgmt_api::Error::ReceiveBody(err) => {
-            // Presume errors receiving body are connectivity/availability issues except for decoding errors
-            let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
-            ApiError::ResourceUnavailable(
-                format!("{node} error receiving error body: {err} {}", src_str).into(),
-            )
+        mgmt_api::Error::ReceiveBody(str) => {
+            // Presume errors receiving body are connectivity/availability issues
+            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
        }
        mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
            ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -6,13 +6,21 @@ license.workspace = true

 [dependencies]
 aws-sdk-s3.workspace = true
+aws-smithy-async.workspace = true
 either.workspace = true
+tokio-rustls.workspace = true
 anyhow.workspace = true
 git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
+thiserror.workspace = true
+rand.workspace = true
+bytes.workspace = true
+bincode.workspace = true
+crc32c.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,8 +1,7 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};

 use anyhow::Context;
 use itertools::Itertools;
-use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
@@ -49,6 +48,56 @@ impl TimelineAnalysis {
    }
 }

+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |                 |       |
+/// |   1   |    |   2   |    |   3   |
+/// |       |    |       |    |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |    |   2   |    |       |
+/// |   1   |    |-------|    |   3   |
+/// |       |    |   4   |    |       |
+/// ```
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+fn check_valid_layermap(metadata: &HashMap<LayerName, LayerFileMetadata>) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+    let mut all_delta_layers = Vec::new();
+    for (name, _) in metadata.iter() {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                        "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                        layer,
+                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+                    );
+            return Some(err);
+        }
+    }
+    None
+}
+
 pub(crate) async fn branch_cleanup_and_check_errors(
    remote_client: &GenericRemoteStorage,
    id: &TenantShardTimelineId,
@@ -128,8 +177,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                        }
                    }

-                    let layer_names = index_part.layer_metadata.keys().cloned().collect_vec();
-                    if let Some(err) = check_valid_layermap(&layer_names) {
+                    if let Some(err) = check_valid_layermap(&index_part.layer_metadata) {
                        result.errors.push(format!(
                            "index_part.json contains invalid layer map structure: {err}"
                        ));
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Vlad Lazar	e02334c15e	fixup: doc reference to renamed field	2024-09-16 19:44:09 +01:00
Christian Schwarz	f0430e97a2	prove hypothesis (inefficient fix)	2024-09-16 17:17:49 +01:00
Vlad Lazar	25e31b247b	tests: add unit test for vec read with overlapped images	2024-09-16 17:17:26 +01:00
				`@@ -1 +0,0 @@`
				`GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;`