Merge branch 'layer-stats' of github.com:neondatabase/neon into layer-stats

Address comments
accept suggestion
2026-06-09 16:30:37 +00:00 · 2023-06-10 10:47:42 -04:00 · 2023-06-10 10:46:28 -04:00 · 2023-06-10 10:35:13 -04:00 · 2023-06-02 16:46:48 -04:00
205 changed files with 5410 additions and 10633 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -180,8 +180,7 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    # Increase timeout to 8h, default timeout is 6h
-    timeout-minutes: 480
+    timeout-minutes: 360 # 6h

    steps:
    - uses: actions/checkout@v3
@@ -322,6 +321,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

@@ -413,6 +414,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

@@ -498,6 +501,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact

      - name: Install rust binaries
        run: |
@@ -623,6 +623,51 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

+
+  neon-image-depot:
+    # For testing this will run side-by-side for a few merges.
+    # This action is not really optimized yet, but gets the job done
+    runs-on: [ self-hosted, gen3, large ]
+    needs: [ tag ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    permissions:
+      contents: read
+      id-token: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Setup go
+        uses: actions/setup-go@v3
+        with:
+          go-version: '1.19'
+
+      - name: Set up Depot CLI
+        uses: depot/setup-action@v1
+
+      - name: Install Crane & ECR helper
+        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Build and push
+        uses: depot/build-push-action@v1
+        with:
+          # if no depot.json file is at the root of your repo, you must specify the project id
+          project: nrdv0s4kcs
+          push: true
+          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+          build-args: |
+            GIT_VERSION=${{ github.sha }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
@@ -659,7 +704,6 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -717,40 +761,10 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --cleanup
-
-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
@@ -767,7 +781,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.11.1
+      VM_BUILDER_VERSION: v0.8.0

    steps:
      - name: Checkout
@@ -869,10 +883,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -883,10 +895,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -908,93 +918,16 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
-      # Later all the extensions will be moved to extensions image.
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
-      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: |
-        ${{ github.ref_name == 'release' &&
-          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
-          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
-
-    steps:
-      - name: Pull postgres-extensions image
-        run: |
-          docker pull ${EXTENSIONS_IMAGE}
-          docker pull ${COMPUTE_NODE_IMAGE}
-
-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
-
-          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
-          echo "CID=${CID}" >> $GITHUB_OUTPUT
-
-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload ./custom-extensions # Just in case
-
-          # In compute image we have a bit different directory layout
-          mkdir -p extensions-to-upload/share
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
-
-          # Delete Neon extensitons (they always present on compute-node image)
-          rm -rf ./extensions-to-upload/share/extension/neon*
-          rm -rf ./extensions-to-upload/lib/neon*
-
-          # Delete leftovers from the extension build step
-          rm -rf ./extensions-to-upload/lib/pgxs
-          rm -rf ./extensions-to-upload/lib/pkgconfig
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
-          for EXT_NAME in $(ls ./custom-extensions); do
-            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
-
-            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
-            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
-          done
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS}); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.CID }} || true
-          docker rm ${{ steps.create-container.outputs.EID }} || true
-
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -1026,20 +959,6 @@ jobs:
            exit 1
          fi

-      - name: Create git tag
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v6
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            github.rest.git.createRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
-              sha: context.sha,
-            })
-
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,7 +3,6 @@ name: Create Release Branch
 on:
  schedule:
    - cron: '0 10 * * 2'
-  workflow_dispatch:

 jobs:
  create_release_branch:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -200,6 +200,17 @@ dependencies = [
 "critical-section",
 ]

+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -794,6 +805,18 @@ dependencies = [
 "libloading",
 ]

+[[package]]
+name = "clap"
+version = "3.2.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
+dependencies = [
+ "bitflags",
+ "clap_lex 0.2.4",
+ "indexmap",
+ "textwrap",
+]
+
 [[package]]
 name = "clap"
 version = "4.3.0"
@@ -814,7 +837,7 @@ dependencies = [
 "anstream",
 "anstyle",
 "bitflags",
- "clap_lex",
+ "clap_lex 0.5.0",
 "strsim",
 ]

@@ -830,6 +853,15 @@ dependencies = [
 "syn 2.0.16",
 ]

+[[package]]
+name = "clap_lex"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
+dependencies = [
+ "os_str_bytes",
+]
+
 [[package]]
 name = "clap_lex"
 version = "0.5.0"
@@ -883,7 +915,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "clap",
+ "clap 4.3.0",
 "compute_api",
 "futures",
 "hyper",
@@ -945,7 +977,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap",
+ "clap 4.3.0",
 "comfy-table",
 "compute_api",
 "git-version",
@@ -1015,19 +1047,19 @@ dependencies = [

 [[package]]
 name = "criterion"
-version = "0.5.1"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
 dependencies = [
 "anes",
+ "atty",
 "cast",
 "ciborium",
- "clap",
+ "clap 3.2.25",
 "criterion-plot",
- "is-terminal",
 "itertools",
+ "lazy_static",
 "num-traits",
- "once_cell",
 "oorandom",
 "plotters",
 "rayon",
@@ -1108,7 +1140,7 @@ dependencies = [
 "crossterm_winapi",
 "libc",
 "mio",
- "parking_lot 0.12.1",
+ "parking_lot",
 "signal-hook",
 "signal-hook-mio",
 "winapi",
@@ -1178,7 +1210,7 @@ dependencies = [
 "hashbrown 0.12.3",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core",
 ]

 [[package]]
@@ -1644,6 +1676,15 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hermit-abi"
 version = "0.2.6"
@@ -1898,9 +1939,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
 dependencies = [
 "cfg-if",
- "js-sys",
- "wasm-bindgen",
- "web-sys",
 ]

 [[package]]
@@ -2229,6 +2267,16 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2301,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"

 [[package]]
 name = "openssl"
-version = "0.10.55"
+version = "0.10.52"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
+checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
 dependencies = [
 "bitflags",
 "cfg-if",
@@ -2333,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "openssl-sys"
-version = "0.9.90"
+version = "0.9.87"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
+checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
 dependencies = [
 "cc",
 "libc",
@@ -2456,20 +2504,33 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "os_str_bytes"
+version = "6.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
+
 [[package]]
 name = "outref"
 version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "pagectl"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
- "clap",
+ "clap 4.3.0",
 "git-version",
+ "itertools",
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
@@ -2487,7 +2548,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
- "clap",
+ "clap 4.3.0",
 "close_fds",
 "const_format",
 "consumption_metrics",
@@ -2569,17 +2630,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core 0.8.6",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2587,21 +2637,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall 0.2.16",
- "smallvec",
- "winapi",
+ "parking_lot_core",
 ]

 [[package]]
@@ -2617,16 +2653,6 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

-[[package]]
-name = "pbkdf2"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
-dependencies = [
- "digest",
- "hmac",
-]
-
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -2745,7 +2771,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2758,7 +2784,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2769,7 +2795,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2787,7 +2813,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2849,6 +2875,7 @@ dependencies = [
 "serde",
 "thiserror",
 "utils",
+ "wal_craft",
 "workspace_hack",
 ]

@@ -2932,7 +2959,7 @@ dependencies = [
 "lazy_static",
 "libc",
 "memchr",
- "parking_lot 0.12.1",
+ "parking_lot",
 "procfs",
 "thiserror",
 ]
@@ -2997,11 +3024,12 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "atty",
 "base64 0.13.1",
 "bstr",
 "bytes",
 "chrono",
- "clap",
+ "clap 4.3.0",
 "consumption_metrics",
 "futures",
 "git-version",
@@ -3019,8 +3047,7 @@ dependencies = [
 "native-tls",
 "once_cell",
 "opentelemetry",
- "parking_lot 0.12.1",
- "pbkdf2",
+ "parking_lot",
 "pin-project-lite",
 "postgres-native-tls",
 "postgres_backend",
@@ -3031,7 +3058,6 @@ dependencies = [
 "regex",
 "reqwest",
 "reqwest-middleware",
- "reqwest-retry",
 "reqwest-tracing",
 "routerify",
 "rstest",
@@ -3267,29 +3293,6 @@ dependencies = [
 "thiserror",
 ]

-[[package]]
-name = "reqwest-retry"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
-dependencies = [
- "anyhow",
- "async-trait",
- "chrono",
- "futures",
- "getrandom",
- "http",
- "hyper",
- "parking_lot 0.11.2",
- "reqwest",
- "reqwest-middleware",
- "retry-policies",
- "task-local-extensions",
- "tokio",
- "tracing",
- "wasm-timer",
-]
-
 [[package]]
 name = "reqwest-tracing"
 version = "0.4.4"
@@ -3308,17 +3311,6 @@ dependencies = [
 "tracing-opentelemetry",
 ]

-[[package]]
-name = "retry-policies"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
-dependencies = [
- "anyhow",
- "chrono",
- "rand",
-]
-
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3517,7 +3509,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
- "clap",
+ "clap 4.3.0",
 "const_format",
 "crc32c",
 "fs2",
@@ -3528,7 +3520,7 @@ dependencies = [
 "hyper",
 "metrics",
 "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
 "postgres",
 "postgres-protocol",
 "postgres_backend",
@@ -3947,7 +3939,7 @@ dependencies = [
 "anyhow",
 "async-stream",
 "bytes",
- "clap",
+ "clap 4.3.0",
 "const_format",
 "futures",
 "futures-core",
@@ -3957,7 +3949,7 @@ dependencies = [
 "hyper",
 "metrics",
 "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
 "prost",
 "tokio",
 "tokio-stream",
@@ -4128,6 +4120,12 @@ dependencies = [
 "syn 1.0.109",
 ]

+[[package]]
+name = "textwrap"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
+
 [[package]]
 name = "thiserror"
 version = "1.0.40"
@@ -4276,7 +4274,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4285,7 +4283,7 @@ dependencies = [
 "futures-channel",
 "futures-util",
 "log",
- "parking_lot 0.12.1",
+ "parking_lot",
 "percent-encoding",
 "phf",
 "pin-project-lite",
@@ -4543,7 +4541,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap",
+ "clap 4.3.0",
 "pageserver_api",
 "utils",
 "workspace_hack",
@@ -4645,6 +4643,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -4813,6 +4812,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "atty",
 "bincode",
 "byteorder",
 "bytes",
@@ -4889,15 +4889,13 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap",
+ "clap 4.3.0",
 "env_logger",
 "log",
 "once_cell",
 "postgres",
 "postgres_ffi",
- "regex",
 "tempfile",
- "utils",
 "workspace_hack",
 ]

@@ -4993,21 +4991,6 @@ version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"

-[[package]]
-name = "wasm-timer"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
-dependencies = [
- "futures",
- "js-sys",
- "parking_lot 0.11.2",
- "pin-utils",
- "wasm-bindgen",
- "wasm-bindgen-futures",
- "web-sys",
-]
-
 [[package]]
 name = "web-sys"
 version = "0.3.63"
@@ -5269,7 +5252,7 @@ dependencies = [
 "anyhow",
 "bytes",
 "chrono",
- "clap",
+ "clap 4.3.0",
 "clap_builder",
 "crossbeam-utils",
 "either",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,20 +9,7 @@ members = [
    "storage_broker",
    "workspace_hack",
    "trace",
-    "libs/compute_api",
-    "libs/pageserver_api",
-    "libs/postgres_ffi",
-    "libs/safekeeper_api",
-    "libs/utils",
-    "libs/consumption_metrics",
-    "libs/postgres_backend",
-    "libs/pq_proto",
-    "libs/tenant_size_model",
-    "libs/metrics",
-    "libs/postgres_connection",
-    "libs/remote_storage",
-    "libs/tracing-utils",
-    "libs/postgres_ffi/wal_craft",
+    "libs/*",
 ]

 [workspace.package]
@@ -34,6 +21,7 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
+atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
@@ -86,7 +74,6 @@ opentelemetry = "0.18.0"
 opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
@@ -95,7 +82,6 @@ regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
-reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
 rustls = "0.20"
@@ -129,7 +115,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -141,11 +127,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
@@ -171,7 +157,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
-criterion = "0.5.1"
+criterion = "0.4"
 rcgen = "0.10"
 rstest = "0.17"
 tempfile = "3.4"
@@ -181,7 +167,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }

 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -2,7 +2,6 @@ ARG PG_VERSION
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
-ARG BUILD_TAG

 #########################################################################################
 #
@@ -68,7 +67,7 @@ RUN apt update && \
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
-    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

@@ -96,7 +95,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release .. && \
+    cmake .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -189,8 +188,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
-    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+    echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -356,7 +355,7 @@ RUN apt-get update && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
-    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -411,7 +410,7 @@ RUN apt-get update && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release .. && \
+    cmake .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -433,127 +432,6 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control

-#########################################################################################
-#
-# Layer "rdkit-pg-build"
-# compile rdkit extension
-#
-#########################################################################################
-FROM build-deps AS rdkit-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-RUN apt-get update && \
-    apt-get install -y \
-        cmake \
-        libboost-iostreams1.74-dev \
-        libboost-regex1.74-dev \
-        libboost-serialization1.74-dev \
-        libboost-system1.74-dev \
-        libeigen3-dev \
-        libfreetype6-dev
-
-ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
-    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
-    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
-    cmake \
-        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
-        -D RDK_BUILD_INCHI_SUPPORT=ON \
-        -D RDK_BUILD_AVALON_SUPPORT=ON \
-        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
-        -D RDK_BUILD_DESCRIPTORS3D=OFF \
-        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
-        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
-        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
-        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
-        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
-        -D RDK_USE_URF=OFF \
-        -D RDK_BUILD_PGSQL=ON \
-        -D RDK_PGSQL_STATIC=ON \
-        -D PostgreSQL_CONFIG=pg_config \
-        -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
-        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-        -D RDK_INSTALL_INTREE=OFF \
-        -D CMAKE_BUILD_TYPE=Release \
-        . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
-
-#########################################################################################
-#
-# Layer "pg-uuidv7-pg-build"
-# compile pg_uuidv7 extension
-#
-#########################################################################################
-FROM build-deps AS pg-uuidv7-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
-    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
-
-#########################################################################################
-#
-# Layer "pg-roaringbitmap-pg-build"
-# compile pg_roaringbitmap extension
-#
-#########################################################################################
-FROM build-deps AS pg-roaringbitmap-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
-    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
-    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
-
-#########################################################################################
-#
-# Layer "pg-embedding-pg-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg-embedding-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
-    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
-
-#########################################################################################
-#
-# Layer "pg-anon-pg-build"
-# compile anon extension
-#
-#########################################################################################
-FROM build-deps AS pg-anon-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
-    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sort  > /before.txt && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sort  > /after.txt && \
-    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
-
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -639,22 +517,6 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

-#########################################################################################
-#
-# Layer "pg-pgx-ulid-build"
-# Compile "pgx_ulid" extension
-#
-#########################################################################################
-
-FROM rust-extensions-build AS pg-pgx-ulid-build
-
-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
-    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
-    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
-
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -662,7 +524,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
-# Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -686,11 +547,6 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -700,10 +556,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/hnsw \
        -s install

 #########################################################################################
@@ -712,9 +564,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
@@ -739,22 +588,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only for new custom ones
-#
-# # Default extensions
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
-# Custom extensions
-COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
-COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
-
 #########################################################################################
 #
 # Final layer
@@ -783,19 +616,14 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
-# libboost*, libfreetype6, and zlib1g for rdkit
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
+        locales \
        libicu67 \
        liblz4-1 \
        libreadline8 \
-        libboost-iostreams1.74.0 \
-        libboost-regex1.74.0 \
-        libboost-serialization1.74.0 \
-        libboost-system1.74.0 \
        libossp-uuid16 \
-        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -805,9 +633,7 @@ RUN apt update &&  \
        libxslt1.1 \
        libzstd1 \
        libcurl4-openssl-dev \
-        locales \
-        procps \
-        zlib1g && \
+        procps && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -3,7 +3,6 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
-ARG BUILD_TAG

 FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
 WORKDIR /home/nonroot
@@ -17,8 +16,6 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
 ARG CACHEPOT_BUCKET=neon-github-dev
 #ARG AWS_ACCESS_KEY_ID
 #ARG AWS_SECRET_ACCESS_KEY
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG

 COPY . .

--- a/8
+++ b/8
@@ -138,11 +138,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-	+@echo "Compiling hnsw $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -158,9 +153,6 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -28,19 +28,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev
+libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel
+  protobuf-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf curl
+postgresql-libs cmake postgresql protobuf
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -132,13 +131,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > cargo neon init
-Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"
+Starting pageserver at '127.0.0.1:64000' in '.neon'.

 # start pageserver, safekeeper, and broker for their intercommunication
 > cargo neon start
-Starting neon broker at 127.0.0.1:50051.
+Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
-Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
 pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
@@ -152,7 +151,8 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
-Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
+Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
 > cargo neon endpoint list
@@ -188,17 +188,18 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 # start postgres on that branch
 > cargo neon endpoint start migration_check --branch-name migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
-Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'

 # check the new list of running postgres instances
 > cargo neon endpoint list
 ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
 main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
- migration_check  127.0.0.1:55434  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
+ migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running

 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
-> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
 key | value
 -----+-------
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -54,20 +54,11 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
-
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
-    info!("build_tag: {build_tag}");
-
    let matches = cli().get_matches();

-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
@@ -187,8 +178,7 @@ fn main() -> Result<()> {

    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
-    let _http_handle =
-        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
+    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -256,16 +246,6 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

-    // Maybe sync safekeepers again, to speed up next startup
-    let compute_state = compute.state.lock().unwrap().clone();
-    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-    if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
-        info!("syncing safekeepers on shutdown");
-        let storage_auth_token = pspec.storage_auth_token.clone();
-        let lsn = compute.sync_safekeepers(storage_auth_token)?;
-        info!("synced safekeepers at lsn {lsn}");
-    }
-
    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }
@@ -306,14 +286,6 @@ fn cli() -> clap::Command {
    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
    clap::Command::new("compute_ctl")
        .version(version)
-        .arg(
-            Arg::new("http-port")
-                .long("http-port")
-                .value_name("HTTP_PORT")
-                .default_value("3080")
-                .value_parser(clap::value_parser!(u16))
-                .required(false),
-        )
        .arg(
            Arg::new("connstr")
                .short('C')
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,3 +1,19 @@
+//
+// XXX: This is starting to look scarily similar to the `PostgresNode` from `control_plane`,
+// but there are several things that make `PostgresNode` usage inconvenient in the
+// cloud:
+// - it inherits from `LocalEnv`, which contains **all** the information about
+//   a complete running service
+// - it uses `PageServerNode` with information about the http endpoint, which, again,
+//   we do not need in the cloud
+// - many small details differ; for example, we do not use `pg_ctl` in the cloud
+//
+// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
+// attributes (not required for the cloud). Yet, it is still tempting to unify these
+// `PostgresNode` and `ComputeNode` and use one in both places.
+//
+// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
+//
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
@@ -90,38 +106,26 @@ pub struct ParsedSpec {
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
-        // Extract the options from the spec file that are needed to connect to
-        // the storage system.
-        //
-        // For backwards-compatibility, the top-level fields in the spec file
-        // may be empty. In that case, we need to dig them from the GUCs in the
-        // cluster.settings field.
        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
+            .cluster
+            .settings
+            .find("neon.pageserver_connstring")
            .ok_or("pageserver connstr should be provided")?;
        let storage_auth_token = spec.storage_auth_token.clone();
-        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
-            tenant_id
-        } else {
-            spec.cluster
-                .settings
-                .find("neon.tenant_id")
-                .ok_or("tenant id should be provided")
-                .map(|s| TenantId::from_str(&s))?
-                .or(Err("invalid tenant id"))?
-        };
-        let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
-            timeline_id
-        } else {
-            spec.cluster
-                .settings
-                .find("neon.timeline_id")
-                .ok_or("timeline id should be provided")
-                .map(|s| TimelineId::from_str(&s))?
-                .or(Err("invalid timeline id"))?
-        };
+        let tenant_id: TenantId = spec
+            .cluster
+            .settings
+            .find("neon.tenant_id")
+            .ok_or("tenant id should be provided")
+            .map(|s| TenantId::from_str(&s))?
+            .or(Err("invalid tenant id"))?;
+        let timeline_id: TimelineId = spec
+            .cluster
+            .settings
+            .find("neon.timeline_id")
+            .ok_or("timeline id should be provided")
+            .map(|s| TimelineId::from_str(&s))?
+            .or(Err("invalid timeline id"))?;

        Ok(ParsedSpec {
            spec,
@@ -133,84 +137,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

-/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
-/// that we give to customers
-fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let roles = spec
-        .cluster
-        .roles
-        .iter()
-        .map(|r| format!("'{}'", escape_literal(&r.name)))
-        .collect::<Vec<_>>();
-
-    let dbs = spec
-        .cluster
-        .databases
-        .iter()
-        .map(|db| format!("'{}'", escape_literal(&db.name)))
-        .collect::<Vec<_>>();
-
-    let roles_decl = if roles.is_empty() {
-        String::from("roles text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               roles text[] := ARRAY(SELECT rolname
-                                     FROM pg_catalog.pg_roles
-                                     WHERE rolname IN ({}));"#,
-            roles.join(", ")
-        )
-    };
-
-    let database_decl = if dbs.is_empty() {
-        String::from("dbs text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               dbs text[] := ARRAY(SELECT datname
-                                   FROM pg_catalog.pg_database
-                                   WHERE datname IN ({}));"#,
-            dbs.join(", ")
-        )
-    };
-
-    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
-    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
-    let query = format!(
-        r#"
-            DO $$
-                DECLARE
-                    r text;
-                    {}
-                    {}
-                BEGIN
-                    IF NOT EXISTS (
-                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
-                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
-                        IF array_length(roles, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT neon_superuser TO %s',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
-                            FOREACH r IN ARRAY roles LOOP
-                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
-                            END LOOP;
-                        END IF;
-                        IF array_length(dbs, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
-                        END IF;
-                    END IF;
-                END
-            $$;"#,
-        roles_decl, database_decl,
-    );
-    info!("Neon superuser created:\n{}", &query);
-    client
-        .simple_query(&query)
-        .map_err(|e| anyhow::anyhow!(e).context(query))?;
-    Ok(())
-}
-
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
@@ -235,7 +161,7 @@ impl ComputeNode {

    // Get basebackup from the libpq connection to pageserver using `connstr` and
    // unarchive it to `pgdata` directory overriding all its previous content.
-    #[instrument(skip_all, fields(%lsn))]
+    #[instrument(skip(self, compute_state))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Utc::now();
@@ -277,8 +203,8 @@ impl ComputeNode {

    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
-    #[instrument(skip_all)]
-    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
+    #[instrument(skip(self, storage_auth_token))]
+    fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

        let sync_handle = Command::new(&self.pgbin)
@@ -322,7 +248,7 @@ impl ComputeNode {

    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, compute_state))]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
@@ -369,8 +295,8 @@ impl ComputeNode {
        update_pg_hba(pgdata_path)?;

        match spec.mode {
-            ComputeMode::Primary => {}
-            ComputeMode::Replica | ComputeMode::Static(..) => {
+            ComputeMode::Primary | ComputeMode::Static(..) => {}
+            ComputeMode::Replica => {
                add_standby_signal(pgdata_path)?;
            }
        }
@@ -380,7 +306,7 @@ impl ComputeNode {

    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
@@ -404,7 +330,7 @@ impl ComputeNode {
    }

    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, compute_state))]
    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
        // If connection fails,
        // it may be the old node with `zenith_admin` superuser.
@@ -425,8 +351,6 @@ impl ComputeNode {
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;

                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
-                // Disable forwarding so that users don't get a cloud_admin role
-                client.simple_query("SET neon.forward_ddl = false")?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
@@ -437,28 +361,31 @@ impl ComputeNode {
            Ok(client) => client,
        };

+        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        client.simple_query("SET neon.forward_ddl = false")?;
-
-        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str())?;
+        handle_grants(spec, self.connstr.as_str(), &mut client)?;
        handle_extensions(spec, &mut client)?;

        // 'Close' connection
        drop(client);

+        info!(
+            "finished configuration of compute for project {}",
+            spec.cluster.cluster_id
+        );
+
        Ok(())
    }

    // We could've wrapped this around `pg_ctl reload`, but right now we don't use
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, client))]
    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
@@ -466,7 +393,7 @@ impl ComputeNode {

    /// Similar to `apply_config()`, but does a bit different sequence of operations,
    /// as it's used to reconfigure a previously started and configured Postgres node.
-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

@@ -484,7 +411,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str())?;
+            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
            handle_extensions(&spec, &mut client)?;
        }

@@ -501,38 +428,33 @@ impl ComputeNode {
        Ok(())
    }

-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
-        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
-            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            pspec.tenant_id,
-            pspec.timeline_id,
+            spec.spec.cluster.cluster_id,
+            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            spec.tenant_id,
+            spec.timeline_id,
        );

        self.prepare_pgdata(&compute_state)?;

        let start_time = Utc::now();
-        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

-        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
+
+        if spec.spec.mode == ComputeMode::Primary {
            self.apply_config(&compute_state)?;
        }

        let startup_end_time = Utc::now();
        {
            let mut state = self.state.lock().unwrap();
-            state.metrics.start_postgres_ms = config_time
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
            state.metrics.config_ms = startup_end_time
-                .signed_duration_since(config_time)
+                .signed_duration_since(start_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
@@ -544,11 +466,6 @@ impl ComputeNode {
        }
        self.set_status(ComputeStatus::Running);

-        info!(
-            "finished configuration of compute for project {}",
-            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
-        );
-
        Ok(pg)
    }

--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -5,7 +5,6 @@ use std::path::Path;

 use anyhow::Result;

-use crate::pg_helpers::escape_conf_value;
 use crate::pg_helpers::PgOptionsSerialize;
 use compute_api::spec::{ComputeMode, ComputeSpec};

@@ -37,44 +36,10 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

-    // Write the postgresql.conf content from the spec file as is.
-    if let Some(conf) = &spec.cluster.postgresql_conf {
-        writeln!(file, "{}", conf)?;
-    }
+    writeln!(file, "# Managed by compute_ctl: begin")?;

    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;

-    // Add options for connecting to storage
-    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(
-            file,
-            "neon.pageserver_connstring='{}'",
-            escape_conf_value(s)
-        )?;
-    }
-    if !spec.safekeeper_connstrings.is_empty() {
-        writeln!(
-            file,
-            "neon.safekeepers='{}'",
-            escape_conf_value(&spec.safekeeper_connstrings.join(","))
-        )?;
-    }
-    if let Some(s) = &spec.tenant_id {
-        writeln!(
-            file,
-            "neon.tenant_id='{}'",
-            escape_conf_value(&s.to_string())
-        )?;
-    }
-    if let Some(s) = &spec.timeline_id {
-        writeln!(
-            file,
-            "neon.timeline_id='{}'",
-            escape_conf_value(&s.to_string())
-        )?;
-    }
-
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
@@ -88,12 +53,7 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        }
    }

-    // If there are any extra options in the 'settings' field, append those
-    if spec.cluster.settings.is_some() {
-        writeln!(file, "# Managed by compute_ctl: begin")?;
-        write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
-        writeln!(file, "# Managed by compute_ctl: end")?;
-    }
+    writeln!(file, "# Managed by compute_ctl: end")?;

    Ok(())
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;

 use crate::compute::ComputeNode;

-#[instrument(skip_all)]
+#[instrument(skip(compute))]
 fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {

 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
-async fn serve(port: u16, state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], port));
+async fn serve(state: Arc<ComputeNode>) {
+    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
 }

 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);

    Ok(thread::Builder::new()
        .name("http-endpoint".into())
-        .spawn(move || serve(port, state))?)
+        .spawn(move || serve(state))?)
 }
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -18,7 +18,6 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));

    let fmt_layer = tracing_subscriber::fmt::layer()
-        .with_ansi(false)
        .with_target(false)
        .with_writer(std::io::stderr);

--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -17,13 +17,13 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

 /// Escape a string for including it in a SQL literal
-pub fn escape_literal(s: &str) -> String {
+fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

 /// Escape a string so that it can be used in postgresql.conf.
 /// Same as escape_literal, currently.
-pub fn escape_conf_value(s: &str) -> String {
+fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
 /// Wait for Postgres to become ready to accept connections. It's ready to
 /// accept connections when the state-field in `pgdata/postmaster.pid` says
 /// 'ready'.
-#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
+#[instrument(skip(pg))]
 pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    let pid_path = pgdata.join("postmaster.pid");

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -269,13 +269,17 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
-                    name.pg_quote()
-                );
+                let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
+
+                let grant_query = format!(
+                    "GRANT pg_read_all_data, pg_write_all_data TO {}",
+                    name.pg_quote()
+                );
+                xact.execute(grant_query.as_str(), &[])?;
+                info!("role grant query: '{}'", &grant_query);
            }
        }

@@ -472,11 +476,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                query.push_str(&db.to_pg_options());
                let _guard = info_span!("executing", query).entered();
                client.execute(query.as_str(), &[])?;
-                let grant_query: String = format!(
-                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
-                    name.pg_quote()
-                );
-                client.execute(grant_query.as_str(), &[])?;
            }
        };

@@ -496,9 +495,35 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
    info!("cluster spec grants:");

+    // We now have a separate `web_access` role to connect to the database
+    // via the web interface and proxy link auth. And also we grant a
+    // read / write all data privilege to every role. So also grant
+    // create to everyone.
+    // XXX: later we should stop messing with Postgres ACL in such horrible
+    // ways.
+    let roles = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| r.name.pg_quote())
+        .collect::<Vec<_>>();
+
+    for db in &spec.cluster.databases {
+        let dbname = &db.name;
+
+        let query: String = format!(
+            "GRANT CREATE ON DATABASE {} TO {}",
+            dbname.pg_quote(),
+            roles.join(", ")
+        );
+        info!("grant query {}", &query);
+
+        client.execute(query.as_str(), &[])?;
+    }
+
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -180,11 +180,6 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    }

    // Wait until process is gone
-    wait_until_stopped(process_name, pid)?;
-    Ok(())
-}
-
-pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_flag("force");
-    env.init(pg_version, force)
+    env.init(pg_version)
        .context("Failed to initialize neon repository")?;

    // Initialize pageserver, create initial tenant and timeline.
@@ -477,11 +476,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -

            println!("Creating endpoint for imported timeline ...");
            cplane.new_endpoint(
-                name,
                tenant_id,
+                name,
                timeline_id,
                None,
-                None,
                pg_version,
                ComputeMode::Primary,
            )?;
@@ -593,7 +591,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

                table.add_row([
                    endpoint_id.as_str(),
-                    &endpoint.pg_address.to_string(),
+                    &endpoint.address.to_string(),
                    &endpoint.timeline_id.to_string(),
                    branch_name,
                    lsn_str.as_str(),
@@ -622,8 +620,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_branch_timeline_id(branch_name, tenant_id)
                .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;

-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
+            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
+
            let pg_version = sub_args
                .get_one::<u32>("pg-version")
                .copied()
@@ -641,38 +639,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

-            cplane.new_endpoint(
-                &endpoint_id,
-                tenant_id,
-                timeline_id,
-                pg_port,
-                http_port,
-                pg_version,
-                mode,
-            )?;
+            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
+            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            // If --safekeepers argument is given, use only the listed safekeeper nodes.
-            let safekeepers =
-                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
-                    let mut safekeepers: Vec<NodeId> = Vec::new();
-                    for sk_id in safekeepers_str.split(',').map(str::trim) {
-                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
-                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
-                        })?);
-                        safekeepers.push(sk_id);
-                    }
-                    safekeepers
-                } else {
-                    env.safekeepers.iter().map(|sk| sk.id).collect()
-                };
-
            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -699,7 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -735,15 +709,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");

                let ep = cplane.new_endpoint(
-                    endpoint_id,
                    tenant_id,
+                    endpoint_id,
                    timeline_id,
-                    pg_port,
-                    http_port,
+                    port,
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token)?;
            }
        }
        "stop" => {
@@ -971,22 +944,11 @@ fn cli() -> Command {
        .value_parser(value_parser!(u32))
        .default_value(DEFAULT_PG_VERSION);

-    let pg_port_arg = Arg::new("pg-port")
-        .long("pg-port")
+    let port_arg = Arg::new("port")
+        .long("port")
        .required(false)
        .value_parser(value_parser!(u16))
-        .value_name("pg-port");
-
-    let http_port_arg = Arg::new("http-port")
-        .long("http-port")
-        .required(false)
-        .value_parser(value_parser!(u16))
-        .value_name("http-port");
-
-    let safekeepers_arg = Arg::new("safekeepers")
-        .long("safekeepers")
-        .required(false)
-        .value_name("safekeepers");
+        .value_name("port");

    let stop_mode_arg = Arg::new("stop-mode")
        .short('m')
@@ -1014,13 +976,6 @@ fn cli() -> Command {
        .help("If set, the node will be a hot replica on the specified timeline")
        .required(false);

-    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(bool))
-        .long("force")
-        .action(ArgAction::SetTrue)
-        .help("Force initialization even if the repository is not empty")
-        .required(false);
-
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1036,7 +991,6 @@ fn cli() -> Command {
                        .value_name("config"),
                )
                .arg(pg_version_arg.clone())
-                .arg(force_arg)
        )
        .subcommand(
            Command::new("timeline")
@@ -1139,8 +1093,7 @@ fn cli() -> Command {
                    .arg(branch_name_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(lsn_arg.clone())
-                    .arg(pg_port_arg.clone())
-                    .arg(http_port_arg.clone())
+                    .arg(port_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1156,11 +1109,9 @@ fn cli() -> Command {
                    .arg(branch_name_arg)
                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
-                    .arg(pg_port_arg)
-                    .arg(http_port_arg)
+                    .arg(port_arg)
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
-                    .arg(safekeepers_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,9 +1,3 @@
-//! Code to manage the storage broker
-//!
-//! In the local test environment, the data for each safekeeper is stored in
-//!
-//!   .neon/safekeepers/<safekeeper id>
-//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -1,73 +1,41 @@
-//! Code to manage compute endpoints
-//!
-//! In the local test environment, the data for each endpoint is stored in
-//!
-//!   .neon/endpoints/<endpoint id>
-//!
-//! Some basic information about the endpoint, like the tenant and timeline IDs,
-//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
-//! when the endpoint is created, and doesn't change afterwards.
-//!
-//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
-//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
-//! the basebackup from the pageserver to initialize the the data directory, and
-//! finally launches the PostgreSQL process. It watches the PostgreSQL process
-//! until it exits.
-//!
-//! When an endpoint is created, a `postgresql.conf` file is also created in
-//! the endpoint's directory. The file can be modified before starting PostgreSQL.
-//! However, the `postgresql.conf` file in the endpoint directory is not used directly
-//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
-//! copy of it in the data directory.
-//!
-//! Directory contents:
-//!
-//! ```ignore
-//! .neon/endpoints/main/
-//!     compute.log               - log output of `compute_ctl` and `postgres`
-//!     endpoint.json             - serialized `EndpointConf` struct
-//!     postgresql.conf           - postgresql settings
-//!     spec.json                 - passed to `compute_ctl`
-//!     pgdata/
-//!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
-//!         zenith.signal
-//!         <other PostgreSQL files>
-//! ```
-//!
 use std::collections::BTreeMap;
+use std::fs::{self, File};
+use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
+use std::os::unix::fs::PermissionsExt;
 use std::path::PathBuf;
-use std::process::Command;
+use std::process::{Command, Stdio};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::id::{NodeId, TenantId, TimelineId};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};

 use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

-use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
+use compute_api::spec::ComputeMode;

 // contents of a endpoint.json file
 #[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
-    endpoint_id: String,
+    name: String,
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
-    pg_port: u16,
-    http_port: u16,
+    port: u16,
    pg_version: u32,
-    skip_pg_catalog_updates: bool,
 }

 //
@@ -89,11 +57,11 @@ impl ComputeControlPlane {
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut endpoints = BTreeMap::default();
-        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
+        for endpoint_dir in fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
-            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
+            endpoints.insert(ep.name.clone(), Arc::new(ep));
        }

        Ok(ComputeControlPlane {
@@ -108,58 +76,47 @@ impl ComputeControlPlane {
        1 + self
            .endpoints
            .values()
-            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
+            .map(|ep| ep.address.port())
            .max()
            .unwrap_or(self.base_port)
    }

-    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
-        endpoint_id: &str,
        tenant_id: TenantId,
+        name: &str,
        timeline_id: TimelineId,
-        pg_port: Option<u16>,
-        http_port: Option<u16>,
+        port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
-        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
-        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let port = port.unwrap_or_else(|| self.get_port());
+
        let ep = Arc::new(Endpoint {
-            endpoint_id: endpoint_id.to_owned(),
-            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
-            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
+            name: name.to_owned(),
+            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
            mode,
            tenant_id,
            pg_version,
-            skip_pg_catalog_updates: false,
        });
-
-        ep.create_endpoint_dir()?;
+        ep.create_pgdata()?;
        std::fs::write(
            ep.endpoint_path().join("endpoint.json"),
            serde_json::to_string_pretty(&EndpointConf {
-                endpoint_id: endpoint_id.to_string(),
+                name: name.to_string(),
                tenant_id,
                timeline_id,
                mode,
-                http_port,
-                pg_port,
+                port,
                pg_version,
-                skip_pg_catalog_updates: false,
            })?,
        )?;
-        std::fs::write(
-            ep.endpoint_path().join("postgresql.conf"),
-            ep.setup_pg_conf()?.to_string(),
-        )?;
+        ep.setup_pg_conf()?;

-        self.endpoints
-            .insert(ep.endpoint_id.clone(), Arc::clone(&ep));
+        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));

        Ok(ep)
    }
@@ -170,15 +127,13 @@ impl ComputeControlPlane {
 #[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
-    endpoint_id: String,
+    name: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,

-    // port and address of the Postgres server and `compute_ctl`'s HTTP API
-    pub pg_address: SocketAddr,
-    pub http_address: SocketAddr,
-
+    // port and address of the Postgres server
+    pub address: SocketAddr,
    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,

@@ -186,9 +141,6 @@ pub struct Endpoint {
    // the endpoint runs in.
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
-
-    // Optimizations
-    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
@@ -206,37 +158,123 @@ impl Endpoint {

        // parse data directory name
        let fname = entry.file_name();
-        let endpoint_id = fname.to_str().unwrap().to_string();
+        let name = fname.to_str().unwrap().to_string();

        // Read the endpoint.json file
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

+        // endpoint.json parsed successfully; build the Endpoint from its contents
        Ok(Endpoint {
-            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
-            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
-            endpoint_id,
+            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
+            name,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
-            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
        })
    }

-    fn create_endpoint_dir(&self) -> Result<()> {
-        std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
-            format!(
-                "could not create endpoint directory {}",
-                self.endpoint_path().display()
+    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
+        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
+        let mut cmd = Command::new(pg_path);
+
+        cmd.arg("--sync-safekeepers")
+            .env_clear()
+            .env(
+                "LD_LIBRARY_PATH",
+                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
            )
-        })
+            .env(
+                "DYLD_LIBRARY_PATH",
+                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
+            )
+            .env("PGDATA", self.pgdata().to_str().unwrap())
+            .stdout(Stdio::piped())
+            // Comment out the next line to avoid capturing stderr (useful if the command hangs)
+            .stderr(Stdio::piped());
+
+        if let Some(token) = auth_token {
+            cmd.env("NEON_AUTH_TOKEN", token);
+        }
+
+        let sync_handle = cmd
+            .spawn()
+            .expect("postgres --sync-safekeepers failed to start");
+
+        let sync_output = sync_handle
+            .wait_with_output()
+            .expect("postgres --sync-safekeepers failed");
+        if !sync_output.status.success() {
+            anyhow::bail!(
+                "sync-safekeepers failed: '{}'",
+                String::from_utf8_lossy(&sync_output.stderr)
+            );
+        }
+
+        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
+        println!("Safekeepers synced on {}", lsn);
+        Ok(lsn)
    }

-    // Generate postgresql.conf with default configuration
-    fn setup_pg_conf(&self) -> Result<PostgresConf> {
+    /// Get basebackup from the pageserver as a tar archive and extract it
+    /// to the `self.pgdata()` directory.
+    fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
+        println!(
+            "Extracting base backup to create postgres instance: path={} port={}",
+            self.pgdata().display(),
+            self.address.port()
+        );
+
+        let sql = if let Some(lsn) = lsn {
+            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
+        } else {
+            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
+        };
+
+        let mut client = self
+            .pageserver
+            .page_server_psql_client()
+            .context("connecting to page server failed")?;
+
+        let copyreader = client
+            .copy_out(sql.as_str())
+            .context("page server 'basebackup' command failed")?;
+
+        // Read the archive directly from the `CopyOutReader`
+        //
+        // Set `ignore_zeros` so that unpack() reads all the Copy data and
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata())
+            .context("extracting base backup failed")?;
+
+        Ok(())
+    }
+
+    fn create_pgdata(&self) -> Result<()> {
+        fs::create_dir_all(self.pgdata()).with_context(|| {
+            format!(
+                "could not create data directory {}",
+                self.pgdata().display()
+            )
+        })?;
+        fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
+            .with_context(|| {
+                format!(
+                    "could not set permissions in data directory {}",
+                    self.pgdata().display()
+                )
+            })
+    }
+
+    // Write postgresql.conf with default configuration
+    // and PG_VERSION file to the data directory of a new endpoint.
+    fn setup_pg_conf(&self) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
@@ -249,14 +287,25 @@ impl Endpoint {
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
-        conf.append("listen_addresses", &self.pg_address.ip().to_string());
-        conf.append("port", &self.pg_address.port().to_string());
+        conf.append("listen_addresses", &self.address.ip().to_string());
+        conf.append("port", &self.address.port().to_string());
        conf.append("wal_keep_size", "0");
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");

-        // Load the 'neon' extension
+        // Configure the Neon Postgres extension to fetch pages from pageserver
+        let pageserver_connstr = {
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());
+
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
+        };
        conf.append("shared_preload_libraries", "neon");
+        conf.append_line("");
+        conf.append("neon.pageserver_connstring", &pageserver_connstr);
+        conf.append("neon.tenant_id", &self.tenant_id.to_string());
+        conf.append("neon.timeline_id", &self.timeline_id.to_string());

        conf.append_line("");
        // Replication-related configurations, such as WAL sending
@@ -341,11 +390,46 @@ impl Endpoint {
            }
        }

-        Ok(conf)
+        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
+        file.write_all(conf.to_string().as_bytes())?;
+
+        let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
+        file.write_all(self.pg_version.to_string().as_bytes())?;
+
+        Ok(())
+    }
+
+    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
+        let backup_lsn = match &self.mode {
+            ComputeMode::Primary => {
+                if !self.env.safekeepers.is_empty() {
+                    // LSN 0 means that this is a bootstrap and we only need to download
+                    // the latest data from the pageserver. That is a bit clumsy, but the whole
+                    // bootstrap procedure is evolving quite actively right now, so let's revisit
+                    // this when things are more stable (TODO).
+                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
+                    if lsn == Lsn(0) {
+                        None
+                    } else {
+                        Some(lsn)
+                    }
+                } else {
+                    None
+                }
+            }
+            ComputeMode::Static(lsn) => Some(*lsn),
+            ComputeMode::Replica => {
+                None // Take the latest snapshot available to start with
+            }
+        };
+
+        self.do_basebackup(backup_lsn)?;
+
+        Ok(())
    }

    pub fn endpoint_path(&self) -> PathBuf {
-        self.env.endpoints_path().join(&self.endpoint_id)
+        self.env.endpoints_path().join(&self.name)
    }

    pub fn pgdata(&self) -> PathBuf {
@@ -355,7 +439,7 @@ impl Endpoint {
    pub fn status(&self) -> &str {
        let timeout = Duration::from_millis(300);
        let has_pidfile = self.pgdata().join("postmaster.pid").exists();
-        let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
+        let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();

        match (has_pidfile, can_connect) {
            (true, true) => "running",
@@ -373,6 +457,8 @@ impl Endpoint {
                &[
                    "-D",
                    self.pgdata().to_str().unwrap(),
+                    "-l",
+                    self.pgdata().join("pg.log").to_str().unwrap(),
                    "-w", //wait till pg_ctl actually does what was asked
                ],
                args,
@@ -405,203 +491,39 @@ impl Endpoint {
                String::from_utf8_lossy(&pg_ctl.stderr),
            );
        }
-
-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
-        // TODO use background_process::stop_process instead
-        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
-        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
-        let pid = nix::unistd::Pid::from_raw(pid as i32);
-        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }

-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => String::from_utf8(content)?,
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-            Err(e) => {
-                return Err(anyhow::Error::new(e).context(format!(
-                    "failed to read config file in {}",
-                    postgresql_conf_path.to_str().unwrap()
-                )))
-            }
-        };
-
-        // We always start the compute node from scratch, so if the Postgres
-        // data dir exists from a previous launch, remove it first.
-        if self.pgdata().exists() {
-            std::fs::remove_dir_all(self.pgdata())?;
-        }
-
-        let pageserver_connstring = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
-
-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        };
-        let mut safekeeper_connstrings = Vec::new();
-        if self.mode == ComputeMode::Primary {
-            for sk_id in safekeepers {
-                let sk = self
-                    .env
-                    .safekeepers
-                    .iter()
-                    .find(|node| node.id == sk_id)
-                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
-            }
-        }
-
-        // Create spec file
-        let spec = ComputeSpec {
-            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
-            format_version: 1.0,
-            operation_uuid: None,
-            cluster: Cluster {
-                cluster_id: None, // project ID: not used
-                name: None,       // project name: not used
-                state: None,
-                roles: vec![],
-                databases: vec![],
-                settings: None,
-                postgresql_conf: Some(postgresql_conf),
-            },
-            delta_operations: None,
-            tenant_id: Some(self.tenant_id),
-            timeline_id: Some(self.timeline_id),
-            mode: self.mode,
-            pageserver_connstring: Some(pageserver_connstring),
-            safekeeper_connstrings,
-            storage_auth_token: auth_token.clone(),
-        };
-        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
-
-        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
-        let logfile = std::fs::OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(self.endpoint_path().join("compute.log"))?;
-
-        // Launch compute_ctl
-        println!("Starting postgres node at '{}'", self.connstr());
-        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-        cmd.args(["--http-port", &self.http_address.port().to_string()])
-            .args(["--pgdata", self.pgdata().to_str().unwrap()])
-            .args(["--connstr", &self.connstr()])
-            .args([
-                "--spec-path",
-                self.endpoint_path().join("spec.json").to_str().unwrap(),
-            ])
-            .args([
-                "--pgbin",
-                self.env
-                    .pg_bin_dir(self.pg_version)?
-                    .join("postgres")
-                    .to_str()
-                    .unwrap(),
-            ])
-            .stdin(std::process::Stdio::null())
-            .stderr(logfile.try_clone()?)
-            .stdout(logfile);
-        let child = cmd.spawn()?;
-
-        // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
-        let pid = child.id();
-        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
-        std::fs::write(pidfile_path, pid.to_string())?;
-
-        // Wait for it to start
-        let mut attempt = 0;
-        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
-        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
-        loop {
-            attempt += 1;
-            match self.get_status() {
-                Ok(state) => {
-                    match state.status {
-                        ComputeStatus::Init => {
-                            if attempt == MAX_ATTEMPTS {
-                                bail!("compute startup timed out; still in Init state");
-                            }
-                            // keep retrying
-                        }
-                        ComputeStatus::Running => {
-                            // All good!
-                            break;
-                        }
-                        ComputeStatus::Failed => {
-                            bail!(
-                                "compute startup failed: {}",
-                                state
-                                    .error
-                                    .as_deref()
-                                    .unwrap_or("<no error from compute_ctl>")
-                            );
-                        }
-                        ComputeStatus::Empty
-                        | ComputeStatus::ConfigurationPending
-                        | ComputeStatus::Configuration => {
-                            bail!("unexpected compute status: {:?}", state.status)
-                        }
-                    }
-                }
-                Err(e) => {
-                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context(
-                            "timed out waiting to connect to compute_ctl HTTP; last error: {e}",
-                        );
-                    }
-                }
-            }
-            std::thread::sleep(ATTEMPT_INTERVAL);
-        }
-
-        Ok(())
-    }
-
-    // Call the /status HTTP API
-    pub fn get_status(&self) -> Result<ComputeState> {
-        let client = reqwest::blocking::Client::new();
-
-        let response = client
-            .request(
-                reqwest::Method::GET,
-                format!(
-                    "http://{}:{}/status",
-                    self.http_address.ip(),
-                    self.http_address.port()
-                ),
+        // 1. We always start Postgres from scratch: read the existing 'postgresql.conf'
+        // into memory, then drop and recreate the data directory
+        let postgresql_conf_path = self.pgdata().join("postgresql.conf");
+        let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
+            format!(
+                "failed to read config file in {}",
+                postgresql_conf_path.to_str().unwrap()
            )
-            .send()?;
+        })?;
+        fs::remove_dir_all(self.pgdata())?;
+        self.create_pgdata()?;

-        // Interpret the response
-        let status = response.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            Ok(response.json()?)
-        } else {
-            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
-            let url = response.url().to_owned();
-            let msg = match response.text() {
-                Ok(err_body) => format!("Error: {}", err_body),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
-            };
-            Err(anyhow::anyhow!(msg))
+        // 2. Restore the saved 'postgresql.conf'
+        fs::write(&postgresql_conf_path, postgresql_conf)?;
+
+        // 3. Load basebackup
+        self.load_basebackup(auth_token)?;
+
+        if self.mode != ComputeMode::Primary {
+            File::create(self.pgdata().join("standby.signal"))?;
        }
+
+        // 4. Finally start postgres
+        println!("Starting postgres at '{}'", self.connstr());
+        self.pg_ctl(&["start"], auth_token)
    }

    pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -618,7 +540,7 @@ impl Endpoint {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            std::fs::remove_dir_all(self.endpoint_path())?;
+            fs::remove_dir_all(self.endpoint_path())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
@@ -627,10 +549,10 @@ impl Endpoint {

    pub fn connstr(&self) -> String {
        format!(
-            "postgresql://{}@{}:{}/{}",
+            "host={} port={} user={} dbname={}",
+            self.address.ip(),
+            self.address.port(),
            "cloud_admin",
-            self.pg_address.ip(),
-            self.pg_address.port(),
            "postgres"
        )
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
-    // compute endpoints).
+    // compute nodes).
    //
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
@@ -364,7 +364,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -372,29 +372,11 @@ impl LocalEnv {
            "repository base path is missing"
        );

-        if base_path.exists() {
-            if force {
-                println!("removing all contents of '{}'", base_path.display());
-                // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                // S3 local SSD) to the `.neon` base directory.
-                for entry in std::fs::read_dir(base_path)? {
-                    let entry = entry?;
-                    let path = entry.path();
-                    if path.is_dir() {
-                        fs::remove_dir_all(&path)?;
-                    } else {
-                        fs::remove_file(&path)?;
-                    }
-                }
-            } else {
-                bail!(
-                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
-                    base_path.display()
-                );
-            }
-        }
-
+        ensure!(
+            !base_path.exists(),
+            "directory '{}' already exists. Perhaps already initialized?",
+            base_path.display()
+        );
        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
@@ -410,9 +392,7 @@ impl LocalEnv {
            }
        }

-        if !base_path.exists() {
-            fs::create_dir(base_path)?;
-        }
+        fs::create_dir(base_path)?;

        // Generate keypair for JWT.
        //
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,9 +1,3 @@
-//! Code to manage pageservers
-//!
-//! In the local test environment, the pageserver stores its data directly in
-//!
-//!   .neon/
-//!
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fs::File;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,9 +1,3 @@
-//! Code to manage safekeepers
-//!
-//! In the local test environment, the data for each safekeeper is stored in
-//!
-//!   .neon/safekeepers/<safekeeper id>
-//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -189,7 +189,7 @@ services:
      - "/bin/bash"
      - "-c"
    command:
-      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
+      - "until pg_isready -h compute -p 55433 ; do
            echo 'Waiting to start compute...' && sleep 1;
         done"
    depends_on:
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -48,7 +48,6 @@ Creating docker-compose_storage_broker_1       ... done
 2. connect compute node
 ```
 $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ chmod 600 ~/.pgpass
 $ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};

 use crate::spec::ComputeSpec;

-#[derive(Serialize, Debug, Deserialize)]
+#[derive(Serialize, Debug)]
 pub struct GenericAPIError {
    pub error: String,
 }

 /// Response of the /status API
-#[derive(Serialize, Debug, Deserialize)]
+#[derive(Serialize, Debug)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
    pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

-#[derive(Deserialize, Serialize)]
+#[derive(Serialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeState {
    pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
    pub error: Option<String>,
 }

-#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
@@ -71,7 +71,6 @@ pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
-    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
 }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,7 +5,6 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 /// String type alias representing Postgres identifier and
@@ -15,7 +14,7 @@ pub type PgIdent = String;
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[serde_as]
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+#[derive(Clone, Debug, Default, Deserialize)]
 pub struct ComputeSpec {
    pub format_version: f32,

@@ -27,38 +26,9 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

-    /// An optinal hint that can be passed to speed up startup time if we know
-    /// that no pg catalog mutations (like role creation, database creation,
-    /// extension creation) need to be done on the actual database to start.
-    #[serde(default)] // Default false
-    pub skip_pg_catalog_updates: bool,
-
-    // Information needed to connect to the storage layer.
-    //
-    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
-    //
-    // Depending on `mode`, this can be a primary read-write node, a read-only
-    // replica, or a read-only node pinned at an older LSN.
-    // `safekeeper_connstrings` must be set for a primary.
-    //
-    // For backwards compatibility, the control plane may leave out all of
-    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
-    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
-    // updated to fill these fields, we can make these non optional.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub tenant_id: Option<TenantId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub pageserver_connstring: Option<String>,
-    #[serde(default)]
-    pub safekeeper_connstrings: Vec<String>,
-
    #[serde(default)]
    pub mode: ComputeMode,

-    /// If set, 'storage_auth_token' is used as the password to authenticate to
-    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
 }

@@ -77,19 +47,13 @@ pub enum ComputeMode {
    Replica,
 }

-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+#[derive(Clone, Debug, Default, Deserialize)]
 pub struct Cluster {
-    pub cluster_id: Option<String>,
-    pub name: Option<String>,
+    pub cluster_id: String,
+    pub name: String,
    pub state: Option<String>,
    pub roles: Vec<Role>,
    pub databases: Vec<Database>,
-
-    /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
-    /// tool may add additional settings to the final file.)
-    pub postgresql_conf: Option<String>,
-
-    /// Additional settings that will be appended to the 'postgresql.conf' file.
    pub settings: GenericOptions,
 }

@@ -99,7 +63,7 @@ pub struct Cluster {
 /// - DROP ROLE
 /// - ALTER ROLE name RENAME TO new_name
 /// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize)]
 pub struct DeltaOp {
    pub action: String,
    pub name: PgIdent,
@@ -108,7 +72,7 @@ pub struct DeltaOp {

 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize)]
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
@@ -117,7 +81,7 @@ pub struct Role {

 /// Rust representation of Postgres database info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize)]
 pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
@@ -127,7 +91,7 @@ pub struct Database {
 /// Common type representing both SQL statement params with or without value,
 /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
 /// options like `wal_level = logical`.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize)]
 pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
@@ -148,14 +112,4 @@ mod tests {
        let file = File::open("tests/cluster_spec.json").unwrap();
        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }
-
-    #[test]
-    fn parse_unknown_fields() {
-        // Forward compatibility test
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-        ob.insert("unknown_field_123123123".into(), "hello".into());
-        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
-    }
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,7 +23,6 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
-pub mod metric_vec_duration;

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,23 +0,0 @@
-//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
-
-use std::{future::Future, time::Instant};
-
-pub trait DurationResultObserver {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
-}
-
-pub async fn observe_async_block_duration_by_result<
-    T,
-    E,
-    F: Future<Output = Result<T, E>>,
-    O: DurationResultObserver,
->(
-    observer: &O,
-    block: F,
-) -> Result<T, E> {
-    let start = Instant::now();
-    let result = block.await;
-    let duration = start.elapsed();
-    observer.observe_result(&result, duration);
-    result
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,11 +110,12 @@ impl TenantState {
            Self::Active => Attached,
            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
            // However, it also becomes Broken if the regular load fails.
-            // From Console's perspective there's no practical difference
-            // because attachment_status is polled by console only during attach operation execution.
-            Self::Broken { reason, .. } => Failed {
-                reason: reason.to_owned(),
-            },
+            // We would need a separate TenantState variant to distinguish these cases.
+            // However, there's no practical difference from Console's perspective.
+            // It will run a Postgres-level health check as soon as it observes Attached.
+            // That will fail on Broken tenants.
+            // Console can then roll back the attach, or wait for an operator to fix the Broken tenant.
+            Self::Broken { .. } => Attached,
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
@@ -152,7 +153,7 @@ pub enum ActivatingFrom {
 }

 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -165,7 +166,7 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken { reason: String, backtrace: String },
+    Broken,
 }

 #[serde_as]
@@ -311,11 +312,10 @@ impl std::ops::Deref for TenantAttachConfig {

 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
-#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
+#[serde(rename_all = "snake_case")]
 pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
-    Failed { reason: String },
 }

 #[serde_as]
@@ -809,9 +809,7 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
-            "attachment_status": {
-                "slug":"attached",
-            }
+            "attachment_status": "attached",
        });

        let original_broken = TenantInfo {
@@ -833,9 +831,7 @@ mod tests {
                }
            },
            "current_physical_size": 42,
-            "attachment_status": {
-                "slug":"attached",
-            }
+            "attachment_status": "attached",
        });

        assert_eq!(
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -24,6 +24,7 @@ workspace_hack.workspace = true
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
+wal_craft = { path = "wal_craft" }

 [build-dependencies]
 anyhow.workspace = true
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -33,7 +33,6 @@ macro_rules! postgres_ffi {
            }
            pub mod controlfile_utils;
            pub mod nonrelfile_utils;
-            pub mod wal_craft_test_export;
            pub mod waldecoder_handler;
            pub mod xlog_utils;

@@ -46,15 +45,8 @@ macro_rules! postgres_ffi {
    };
 }

-#[macro_export]
-macro_rules! for_all_postgres_versions {
-    ($macro:tt) => {
-        $macro!(v14);
-        $macro!(v15);
-    };
-}
-
-for_all_postgres_versions! { postgres_ffi }
+postgres_ffi!(v14);
+postgres_ffi!(v15);

 pub mod pg_constants;
 pub mod relfile_utils;
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) // u32 math: no u16 truncation
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
--- a/libs/postgres_ffi/src/wal_craft_test_export.rs
+++ b/libs/postgres_ffi/src/wal_craft_test_export.rs
@@ -1,6 +0,0 @@
-//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
-
-pub use super::PG_MAJORVERSION;
-pub use super::xlog_utils::*;
-pub use super::bindings::*;
-pub use crate::WAL_SEGMENT_SIZE;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -481,4 +481,220 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+#[cfg(test)]
+mod tests {
+    use super::super::PG_MAJORVERSION;
+    use super::*;
+    use regex::Regex;
+    use std::cmp::min;
+    use std::fs;
+    use std::{env, str::FromStr};
+    use utils::const_assert;
+
+    fn init_logging() {
+        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+            format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
+        ))
+        .is_test(true)
+        .try_init();
+    }
+
+    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
+        use wal_craft::*;
+
+        let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
+
+        // Craft some WAL
+        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("..")
+            .join("..");
+        let cfg = Conf {
+            pg_version,
+            pg_distrib_dir: top_path.join("pg_install"),
+            datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
+        };
+        if cfg.datadir.exists() {
+            fs::remove_dir_all(&cfg.datadir).unwrap();
+        }
+        cfg.initdb().unwrap();
+        let srv = cfg.start_server().unwrap();
+        let (intermediate_lsns, expected_end_of_wal_partial) =
+            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
+            .iter()
+            .map(|&lsn| u64::from(lsn).into())
+            .collect();
+        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
+        srv.kill();
+
+        // Check find_end_of_wal on the initial WAL
+        let last_segment = cfg
+            .wal_dir()
+            .read_dir()
+            .unwrap()
+            .map(|f| f.unwrap().file_name().into_string().unwrap())
+            .filter(|fname| IsXLogFileName(fname))
+            .max()
+            .unwrap();
+        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+        for start_lsn in intermediate_lsns
+            .iter()
+            .chain(std::iter::once(&expected_end_of_wal))
+        {
+            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
+            // We assume that `start_lsn` is non-decreasing.
+            info!(
+                "Checking with start_lsn={}, erasing WAL before it",
+                start_lsn
+            );
+            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
+                let fname = file.file_name().into_string().unwrap();
+                if !IsXLogFileName(&fname) {
+                    continue;
+                }
+                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+                if seg_start_lsn > u64::from(*start_lsn) {
+                    continue;
+                }
+                let mut f = File::options().write(true).open(file.path()).unwrap();
+                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
+                f.write_all(
+                    &ZEROS[0..min(
+                        WAL_SEGMENT_SIZE,
+                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
+                    )],
+                )
+                .unwrap();
+            }
+            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
+        }
+    }
+
+    fn check_pg_waldump_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        expected_end_of_wal: Lsn,
+    ) {
+        // Get the actual end of WAL by pg_waldump
+        let waldump_output = cfg
+            .pg_waldump("000000010000000000000001", last_segment)
+            .unwrap()
+            .stderr;
+        let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
+        let caps = match Regex::new(r"invalid record length at (.+):")
+            .unwrap()
+            .captures(waldump_output)
+        {
+            Some(caps) => caps,
+            None => {
+                error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
+                panic!();
+            }
+        };
+        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
+        info!(
+            "waldump erred on {}, expected wal end at {}",
+            waldump_wal_end, expected_end_of_wal
+        );
+        assert_eq!(waldump_wal_end, expected_end_of_wal);
+    }
+
+    fn check_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        start_lsn: Lsn,
+        expected_end_of_wal: Lsn,
+    ) {
+        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
+        // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+        // info!(
+        //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
+        //     wal_end
+        // );
+        // assert_eq!(wal_end, expected_end_of_wal_non_partial);
+
+        // Rename file to partial to actually find last valid lsn, then rename it back.
+        fs::rename(
+            cfg.wal_dir().join(last_segment),
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        )
+        .unwrap();
+        let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+        info!(
+            "find_end_of_wal returned wal_end={} with partial WAL segment",
+            wal_end
+        );
+        assert_eq!(wal_end, expected_end_of_wal);
+        fs::rename(
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+            cfg.wal_dir().join(last_segment),
+        )
+        .unwrap();
+    }
+
+    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
+
+    #[test]
+    pub fn test_find_end_of_wal_simple() {
+        init_logging();
+        test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
+    }
+
+    #[test]
+    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
+        init_logging();
+        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
+            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
+        );
+    }
+
+    #[test]
+    pub fn test_find_end_of_wal_last_crossing_segment() {
+        init_logging();
+        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
+            "test_find_end_of_wal_last_crossing_segment",
+        );
+    }
+
+    /// Check the math in update_next_xid
+    ///
+    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
+    /// currently 1024.
+    #[test]
+    pub fn test_update_next_xid() {
+        let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
+        let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
+
+        checkpoint.nextXid = FullTransactionId { value: 10 };
+        assert_eq!(checkpoint.nextXid.value, 10);
+
+        // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
+        // boundary
+        checkpoint.update_next_xid(100);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+
+        // No change
+        checkpoint.update_next_xid(500);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+        checkpoint.update_next_xid(1023);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+
+        // The function returns the *next* XID, given the highest XID seen so
+        // far. So when we pass 1024, the nextXid gets bumped up to the next
+        // XID_CHECKPOINT_INTERVAL boundary.
+        checkpoint.update_next_xid(1024);
+        assert_eq!(checkpoint.nextXid.value, 2048);
+    }
+
+    #[test]
+    pub fn test_encode_logical_message() {
+        let expected = [
+            64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+            38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+            101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        ];
+        let actual = encode_logical_message("prefix", "message");
+        assert_eq!(expected, actual[..]);
+    }
+}
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -15,7 +15,3 @@ postgres_ffi.workspace = true
 tempfile.workspace = true

 workspace_hack.workspace = true
-
-[dev-dependencies]
-regex.workspace = true
-utils.workspace = true
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -10,20 +10,6 @@ use std::process::Command;
 use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};

-macro_rules! xlog_utils_test {
-    ($version:ident) => {
-        #[path = "."]
-        mod $version {
-            pub use postgres_ffi::$version::wal_craft_test_export::*;
-            #[allow(clippy::duplicate_mod)]
-            #[cfg(test)]
-            mod xlog_utils_test;
-        }
-    };
-}
-
-postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
    pub pg_version: u32,
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -1,219 +0,0 @@
-//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
-
-use super::*;
-use crate::{error, info};
-use regex::Regex;
-use std::cmp::min;
-use std::fs::{self, File};
-use std::io::Write;
-use std::{env, str::FromStr};
-use utils::const_assert;
-use utils::lsn::Lsn;
-
-fn init_logging() {
-    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
-        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
-    ))
-    .is_test(true)
-    .try_init();
-}
-
-fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
-    use crate::*;
-
-    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
-
-    // Craft some WAL
-    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-        .join("..")
-        .join("..")
-        .join("..");
-    let cfg = Conf {
-        pg_version,
-        pg_distrib_dir: top_path.join("pg_install"),
-        datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
-    };
-    if cfg.datadir.exists() {
-        fs::remove_dir_all(&cfg.datadir).unwrap();
-    }
-    cfg.initdb().unwrap();
-    let srv = cfg.start_server().unwrap();
-    let (intermediate_lsns, expected_end_of_wal_partial) =
-        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
-    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
-        .iter()
-        .map(|&lsn| u64::from(lsn).into())
-        .collect();
-    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
-    srv.kill();
-
-    // Check find_end_of_wal on the initial WAL
-    let last_segment = cfg
-        .wal_dir()
-        .read_dir()
-        .unwrap()
-        .map(|f| f.unwrap().file_name().into_string().unwrap())
-        .filter(|fname| IsXLogFileName(fname))
-        .max()
-        .unwrap();
-    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
-    for start_lsn in intermediate_lsns
-        .iter()
-        .chain(std::iter::once(&expected_end_of_wal))
-    {
-        // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
-        // We assume that `start_lsn` is non-decreasing.
-        info!(
-            "Checking with start_lsn={}, erasing WAL before it",
-            start_lsn
-        );
-        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-            let fname = file.file_name().into_string().unwrap();
-            if !IsXLogFileName(&fname) {
-                continue;
-            }
-            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
-            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
-            if seg_start_lsn > u64::from(*start_lsn) {
-                continue;
-            }
-            let mut f = File::options().write(true).open(file.path()).unwrap();
-            const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
-            f.write_all(
-                &ZEROS[0..min(
-                    WAL_SEGMENT_SIZE,
-                    (u64::from(*start_lsn) - seg_start_lsn) as usize,
-                )],
-            )
-            .unwrap();
-        }
-        check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
-    }
-}
-
-fn check_pg_waldump_end_of_wal(
-    cfg: &crate::Conf,
-    last_segment: &str,
-    expected_end_of_wal: Lsn,
-) {
-    // Get the actual end of WAL by pg_waldump
-    let waldump_output = cfg
-        .pg_waldump("000000010000000000000001", last_segment)
-        .unwrap()
-        .stderr;
-    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
-    let caps = match Regex::new(r"invalid record length at (.+):")
-        .unwrap()
-        .captures(waldump_output)
-    {
-        Some(caps) => caps,
-        None => {
-            error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
-            panic!();
-        }
-    };
-    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-    info!(
-        "waldump erred on {}, expected wal end at {}",
-        waldump_wal_end, expected_end_of_wal
-    );
-    assert_eq!(waldump_wal_end, expected_end_of_wal);
-}
-
-fn check_end_of_wal(
-    cfg: &crate::Conf,
-    last_segment: &str,
-    start_lsn: Lsn,
-    expected_end_of_wal: Lsn,
-) {
-    // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-    // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-    // info!(
-    //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
-    //     wal_end
-    // );
-    // assert_eq!(wal_end, expected_end_of_wal_non_partial);
-
-    // Rename file to partial to actually find last valid lsn, then rename it back.
-    fs::rename(
-        cfg.wal_dir().join(last_segment),
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
-    )
-    .unwrap();
-    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-    info!(
-        "find_end_of_wal returned wal_end={} with partial WAL segment",
-        wal_end
-    );
-    assert_eq!(wal_end, expected_end_of_wal);
-    fs::rename(
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
-        cfg.wal_dir().join(last_segment),
-    )
-    .unwrap();
-}
-
-const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
-
-#[test]
-pub fn test_find_end_of_wal_simple() {
-    init_logging();
-    test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
-}
-
-#[test]
-pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
-    init_logging();
-    test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
-        "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-    );
-}
-
-#[test]
-pub fn test_find_end_of_wal_last_crossing_segment() {
-    init_logging();
-    test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
-        "test_find_end_of_wal_last_crossing_segment",
-    );
-}
-
-/// Check the math in update_next_xid
-///
-/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
-/// currently 1024.
-#[test]
-pub fn test_update_next_xid() {
-    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
-    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
-
-    checkpoint.nextXid = FullTransactionId { value: 10 };
-    assert_eq!(checkpoint.nextXid.value, 10);
-
-    // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
-    // boundary
-    checkpoint.update_next_xid(100);
-    assert_eq!(checkpoint.nextXid.value, 1024);
-
-    // No change
-    checkpoint.update_next_xid(500);
-    assert_eq!(checkpoint.nextXid.value, 1024);
-    checkpoint.update_next_xid(1023);
-    assert_eq!(checkpoint.nextXid.value, 1024);
-
-    // The function returns the *next* XID, given the highest XID seen so
-    // far. So when we pass 1024, the nextXid gets bumped up to the next
-    // XID_CHECKPOINT_INTERVAL boundary.
-    checkpoint.update_next_xid(1024);
-    assert_eq!(checkpoint.nextXid.value, 2048);
-}
-
-#[test]
-pub fn test_encode_logical_message() {
-    let expected = [
-        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
-        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
-        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
-    ];
-    let actual = encode_logical_message("prefix", "message");
-    assert_eq!(expected, actual[..]);
-}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,14 +70,6 @@ impl RemotePath {
    pub fn join(&self, segment: &Path) -> Self {
        Self(self.0.join(segment))
    }
-
-    pub fn get_path(&self) -> &PathBuf {
-        &self.0
-    }
-
-    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()?.to_str()
-    }
 }

 /// Storage (potentially remote) API to manage its state.
@@ -94,19 +86,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError>;

-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
-
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
@@ -132,8 +111,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<Download, DownloadError>;

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
-
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }

 pub struct Download {
@@ -195,14 +172,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -254,14 +223,6 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
-
-    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.delete_objects(paths).await,
-            Self::AwsS3(s) => s.delete_objects(paths).await,
-            Self::Unreliable(s) => s.delete_objects(paths).await,
-        }
-    }
 }

 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
+use utils::crashsafe::path_with_suffix_extension;

 use crate::{Download, DownloadError, RemotePath};

@@ -48,14 +48,6 @@ impl LocalFs {
        Ok(Self { storage_root })
    }

-    // mirrors S3Bucket::s3_object_to_relative_path
-    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
-        let relative_path = key
-            .strip_prefix(&self.storage_root)
-            .expect("relative path must contain storage_root as prefix");
-        RemotePath(relative_path.into())
-    }
-
    async fn read_storage_metadata(
        &self,
        file_path: &Path,
@@ -109,63 +101,19 @@ impl RemoteStorage for LocalFs {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
+        Ok(get_all_files(path.as_ref(), false)
            .await
-            .map_err(DownloadError::Other)?;
-
-        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            prefixes.push(
-                prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
+            .map_err(DownloadError::Other)?
+            .into_iter()
+            .map(|path| {
+                path.strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
                    .and_then(RemotePath::new)
                    .expect(
                        "We list files for storage root, hence should be able to remote the prefix",
-                    ),
-            )
-        }
-
-        Ok(prefixes)
-    }
-
-    // recursively lists all files in a directory,
-    // mirroring the `list_files` for `s3_bucket`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let full_path = match folder {
-            Some(folder) => folder.with_base(&self.storage_root),
-            None => self.storage_root.clone(),
-        };
-        let mut files = vec![];
-        let mut directory_queue = vec![full_path.clone()];
-
-        while !directory_queue.is_empty() {
-            let cur_folder = directory_queue
-                .pop()
-                .expect("queue cannot be empty: we just checked");
-            let mut entries = fs::read_dir(cur_folder.clone()).await?;
-            while let Some(entry) = entries.next_entry().await? {
-                let file_name: PathBuf = entry.file_name().into();
-                let full_file_name = cur_folder.clone().join(&file_name);
-                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                files.push(file_remote_path.clone());
-                if full_file_name.is_dir() {
-                    directory_queue.push(full_file_name);
-                }
-            }
-        }
-        Ok(files)
+                    )
+            })
+            .collect())
    }

    async fn upload(
@@ -343,25 +291,11 @@ impl RemoteStorage for LocalFs {

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if !file_path.exists() {
-            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
-            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
-            return Ok(());
+        if file_path.exists() && file_path.is_file() {
+            Ok(fs::remove_file(file_path).await?)
+        } else {
+            bail!("File {file_path:?} either does not exist or is not a file")
        }
-
-        if !file_path.is_file() {
-            anyhow::bail!("{file_path:?} is not a file");
-        }
-        Ok(fs::remove_file(file_path)
-            .await
-            .map_err(|e| anyhow::anyhow!(e))?)
-    }
-
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        for path in paths {
-            self.delete(path).await?
-        }
-        Ok(())
    }
 }

@@ -386,7 +320,7 @@ where
                    let file_type = dir_entry.file_type().await?;
                    let entry_path = dir_entry.path();
                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} is a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -661,11 +595,15 @@ mod fs_tests {
        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());

-        storage
-            .delete(&upload_target)
-            .await
-            .expect("Should allow deleting non-existing storage files");
-
+        match storage.delete(&upload_target).await {
+            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("does not exist"));
+                let expected_path = upload_target.with_base(&storage.storage_root);
+                assert!(error_string.contains(expected_path.to_str().unwrap()));
+            }
+        }
        Ok(())
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -17,7 +17,6 @@ use aws_sdk_s3::{
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
-    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_http::body::SdkBody;
@@ -34,8 +33,6 @@ use crate::{
    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics {
    use metrics::{register_int_counter_vec, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -84,24 +81,12 @@ pub(super) mod metrics {
            .inc();
    }

-    pub fn inc_delete_objects(count: u64) {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
    pub fn inc_delete_object_fail() {
        S3_REQUESTS_FAIL_COUNT
            .with_label_values(&["delete_object"])
            .inc();
    }

-    pub fn inc_delete_objects_fail(count: u64) {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
    pub fn inc_list_objects() {
        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
    }
@@ -347,51 +332,6 @@ impl RemoteStorage for S3Bucket {
        Ok(document_keys)
    }

-    /// See the doc for `RemoteStorage::list_files`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
-
-        // AWS may need to break the response into several parts
-        let mut continuation_token = None;
-        let mut all_files = vec![];
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
-            metrics::inc_list_objects();
-
-            let response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(folder_name.clone())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response)
-                .send()
-                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
-                .context("Failed to list files in S3 bucket")?;
-
-            for object in response.contents().unwrap_or_default() {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                all_files.push(remote_path);
-            }
-            match response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-        Ok(all_files)
-    }
-
    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -456,50 +396,6 @@ impl RemoteStorage for S3Bucket {
        })
        .await
    }
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
-
-        let mut delete_objects = Vec::with_capacity(paths.len());
-        for path in paths {
-            let obj_id = ObjectIdentifier::builder()
-                .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
-            delete_objects.push(obj_id);
-        }
-
-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
-                .send()
-                .await;
-
-            match resp {
-                Ok(resp) => {
-                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
-                        return Err(anyhow::format_err!(
-                            "Failed to delete {} objects",
-                            errors.len()
-                        ));
-                    }
-                }
-                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
-                    return Err(e.into());
-                }
-            }
-        }
-        Ok(())
-    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let _guard = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -24,7 +24,6 @@ enum RemoteOp {
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
-    DeleteObjects(Vec<RemotePath>),
 }

 impl UnreliableWrapper {
@@ -83,11 +82,6 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_prefixes(prefix).await
    }

-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
-        self.inner.list_files(folder).await
-    }
-
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -125,21 +119,4 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Delete(path.clone()))?;
        self.inner.delete(path).await
    }
-
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
-        let mut error_counter = 0;
-        for path in paths {
-            if (self.delete(path).await).is_err() {
-                error_counter += 1;
-            }
-        }
-        if error_counter > 0 {
-            return Err(anyhow::anyhow!(
-                "failed to delete {} objects",
-                error_counter
-            ));
-        }
-        Ok(())
-    }
 }
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -0,0 +1,274 @@
+use std::collections::HashSet;
+use std::env;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::ops::ControlFlow;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use remote_storage::{
+    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+};
+use test_context::{test_context, AsyncTestContext};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
+
+/// Tests that S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
+/// See the client creation in [`create_s3_client`] for details on the required env vars.
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
+/// where
+/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between multiple test runs
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since current default AWS S3 pagination limit is 1000.
+/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+///
+/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+    };
+
+    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix =
+        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+enum MaybeEnabledS3 {
+    Enabled(S3WithTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, S3WithTestBlobs),
+}
+
+struct S3WithTestBlobs {
+    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    remote_prefixes: HashSet<RemotePath>,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3 {
+    async fn setup() -> Self {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
+            .context("S3 client creation")
+            .expect("S3 client creation failed");
+
+        let base_prefix_str = "test/";
+        match upload_s3_data(
+            &client_with_excessive_pagination,
+            base_prefix_str,
+            upload_tasks_count,
+        )
+        .await
+        {
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+                Self::Enabled(S3WithTestBlobs {
+                    client_with_excessive_pagination,
+                    base_prefix_str,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                })
+            }
+            ControlFlow::Break(uploads) => Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
+                S3WithTestBlobs {
+                    client_with_excessive_pagination,
+                    base_prefix_str,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                },
+            ),
+        }
+    }
+
+    async fn teardown(self) {
+        match self {
+            Self::Disabled => {}
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
+                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
+            }
+        }
+    }
+}
+
+fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
+        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
+    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
+        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
+    let random_prefix_part = std::time::SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .context("random s3 test prefix part calculation")?
+        .as_millis();
+    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
+        storage: RemoteStorageKind::AwsS3(S3Config {
+            bucket_name: remote_storage_s3_bucket,
+            bucket_region: remote_storage_s3_region,
+            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            endpoint: None,
+            concurrency_limit: NonZeroUsize::new(100).unwrap(),
+            max_keys_per_list_response: Some(max_keys_per_list_response),
+        }),
+    };
+    Ok(Arc::new(
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
+    ))
+}
+
+struct Uploads {
+    prefixes: HashSet<RemotePath>,
+    blobs: HashSet<RemotePath>,
+}
+
+async fn upload_s3_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} S3 files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
+            let blob_prefix = RemotePath::new(&prefix)
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads)
+    } else {
+        ControlFlow::Continue(uploads)
+    }
+}
+
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            task_client
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(task_run_result) = delete_tasks.join_next().await {
+        match task_run_result {
+            Ok(task_result) => match task_result {
+                Ok(()) => {}
+                Err(e) => error!("Delete task failed: {e:?}"),
+            },
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,542 +0,0 @@
-use std::collections::HashSet;
-use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::ops::ControlFlow;
-use std::path::{Path, PathBuf};
-use std::sync::Arc;
-use std::time::UNIX_EPOCH;
-
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
-};
-use test_context::{test_context, AsyncTestContext};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
-
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
-
-const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
-
-const BASE_PREFIX: &str = "test/";
-
-/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
-/// See the client creation in [`create_s3_client`] for details on the required env vars.
-/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
-/// where
-/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// since current default AWS S3 pagination limit is 1000.
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
-///
-/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledS3WithTestBlobs)]
-#[tokio::test]
-async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `s3_pagination_should_work` for more information.
-///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
-#[tokio::test]
-async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path().to_str().expect("must be valid name"))
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(&PathBuf::from(format!(
-        "{}/for_sure_there_is_nothing_there_really",
-        ctx.base_prefix,
-    )))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
-        .with_context(|| "RemotePath conversion")?;
-
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
-    });
-}
-
-struct EnabledS3 {
-    client: Arc<GenericRemoteStorage>,
-    base_prefix: &'static str,
-}
-
-impl EnabledS3 {
-    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
-        let client = create_s3_client(max_keys_in_list_response)
-            .context("S3 client creation")
-            .expect("S3 client creation failed");
-
-        EnabledS3 {
-            client,
-            base_prefix: BASE_PREFIX,
-        }
-    }
-}
-
-enum MaybeEnabledS3 {
-    Enabled(EnabledS3),
-    Disabled,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3 {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-
-        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        Self::Enabled(EnabledS3::setup(None).await)
-    }
-}
-
-enum MaybeEnabledS3WithTestBlobs {
-    Enabled(S3WithTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, S3WithTestBlobs),
-}
-
-struct S3WithTestBlobs {
-    enabled: EnabledS3,
-    remote_prefixes: HashSet<RemotePath>,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(S3WithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
-                S3WithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledS3WithSimpleTestBlobs {
-    Enabled(S3WithSimpleTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
-}
-struct S3WithSimpleTestBlobs {
-    enabled: EnabledS3,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(S3WithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
-                S3WithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-fn create_s3_client(
-    max_keys_per_list_response: Option<i32>,
-) -> anyhow::Result<Arc<GenericRemoteStorage>> {
-    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
-        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
-    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
-        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
-        .duration_since(UNIX_EPOCH)
-        .context("random s3 test prefix part calculation")?
-        .as_nanos();
-    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
-        storage: RemoteStorageKind::AwsS3(S3Config {
-            bucket_name: remote_storage_s3_bucket,
-            bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
-            endpoint: None,
-            concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response,
-        }),
-    };
-    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
-    ))
-}
-
-struct Uploads {
-    prefixes: HashSet<RemotePath>,
-    blobs: HashSet<RemotePath>,
-}
-
-async fn upload_s3_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} S3 files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
-            let blob_prefix = RemotePath::new(&prefix)
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-async fn upload_simple_s3_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} S3 files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(&blob_path)
-                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+atty.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,8 +1,6 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};

-use anyhow::Context;
-
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -17,19 +15,10 @@ where
    }
 }

-pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
-    let mut dir = tokio::fs::read_dir(&path)
-        .await
-        .context(format!("read_dir({})", path.as_ref().display()))?;
-    Ok(dir.next_entry().await?.is_none())
-}
-
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::is_directory_empty;
-
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -53,26 +42,4 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
-
-    #[tokio::test]
-    async fn is_empty_dir_async() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        // test positive case
-        assert!(
-            is_directory_empty(dir_path).await.expect("test failure"),
-            "new tempdir should be empty"
-        );
-
-        // invoke on a file to ensure it returns an error
-        let file_path: PathBuf = dir_path.join("testfile");
-        let f = std::fs::File::create(&file_path).unwrap();
-        drop(f);
-        assert!(is_directory_empty(&file_path).await.is_err());
-
-        // do it again on a path, we know to be nonexistent
-        std::fs::remove_file(&file_path).unwrap();
-        assert!(is_directory_empty(file_path).await.is_err());
-    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,18 +1,19 @@
 use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use anyhow::Context;
+use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
 use hyper::Method;
-use hyper::{header::CONTENT_TYPE, Body, Request, Response};
+use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
 use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
+use std::net::TcpListener;
 use std::str::FromStr;

 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -347,6 +348,40 @@ pub fn check_permission_with(
    }
 }

+/// Start listening for HTTP requests on the given socket.
+///
+/// Blocks the calling thread: a single-threaded tokio runtime is created
+/// here and drives the hyper server until `shutdown_future` becomes ready,
+/// at which point we stop listening for new requests and the function returns.
+/// Router/service construction failures are returned as errors, not panics.
+pub fn serve_thread_main<S>(
+    router_builder: RouterBuilder<hyper::Body, ApiError>,
+    listener: TcpListener,
+    shutdown_future: S,
+) -> anyhow::Result<()>
+where
+    S: Future<Output = ()> + Send + Sync,
+{
+    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
+
+    // Create a Service from the router above to handle incoming requests.
+    let router = router_builder.build().map_err(|err| anyhow!(err))?;
+    let service = RouterService::new(router).map_err(|err| anyhow!(err))?;
+
+    // Enter a single-threaded tokio runtime bound to the current thread
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()?;
+
+    // Enter the runtime's context before the server is built on the listener.
+    let _guard = runtime.enter();
+    let server = Server::from_tcp(listener)?
+        .serve(service)
+        .with_graceful_shutdown(shutdown_future);
+
+    runtime.block_on(server)?;
+    Ok(())
+}
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,6 +1,5 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
-use std::error::Error as StdError;
 use thiserror::Error;
 use tracing::error;

@@ -16,13 +15,13 @@ pub enum ApiError {
    Unauthorized(String),

    #[error("NotFound: {0}")]
-    NotFound(Box<dyn StdError + Send + Sync + 'static>),
+    NotFound(anyhow::Error),

    #[error("Conflict: {0}")]
    Conflict(String),

    #[error("Precondition failed: {0}")]
-    PreconditionFailed(Box<str>),
+    PreconditionFailed(&'static str),

    #[error(transparent)]
    InternalServerError(anyhow::Error),
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -84,7 +84,7 @@ pub fn init(
    let r = r.with({
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
-            .with_ansi(false)
+            .with_ansi(atty::is(atty::Stream::Stdout))
            .with_writer(std::io::stdout);
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,23 +1,22 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
-use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
+use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
 use std::str::FromStr;
+use std::sync::Arc;
 use std::time::Instant;
-use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

 use criterion::{black_box, criterion_group, criterion_main, Criterion};

-fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
-    let mut layer_map = LayerMap::default();
+fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
+    let mut layer_map = LayerMap::<LayerDescriptor>::default();

    let mut min_lsn = Lsn(u64::MAX);
    let mut max_lsn = Lsn(0);
@@ -34,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(layer.layer_desc().clone());
+        updates.insert_historic(Arc::new(layer));
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -44,7 +43,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
 }

 /// Construct a layer map query pattern for benchmarks
-fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
+fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
    // For each image layer we query one of the pages contained, at LSN right
    // before the image layer was created. This gives us a somewhat uniform
    // coverage of both the lsn and key space because image layers have
@@ -70,7 +69,7 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {

 // Construct a partitioning for testing get_difficulty map when we
 // don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
+fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
    let mut parts = Vec::new();

    // We add a partition boundary at the start of each image layer,
@@ -210,15 +209,13 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
-            TenantId::generate(),
-            TimelineId::generate(),
-            zero.add(10 * i32)..zero.add(10 * i32 + 1),
-            Lsn(i),
-            false,
-            0,
-        ));
-        updates.insert_historic(layer.layer_desc().clone());
+        let layer = LayerDescriptor {
+            key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
+            lsn: Lsn(i)..Lsn(i + 1),
+            is_incremental: false,
+            short_id: format!("Layer {}", i),
+        };
+        updates.insert_historic(Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -16,3 +16,4 @@ postgres_ffi.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+itertools.workspace = true
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -41,9 +41,18 @@ pub(crate) enum LayerCmd {
        /// The id from list-layer command
        id: usize,
    },
+    /// Output layer statistics
+    GetStats {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+        /// The id from list-layer command
+        id: usize,
+    },
 }

-fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+// Return (key, value.len) for all keys, sorted by key.
+fn read_delta_file(path: impl AsRef<Path>) -> Result<Vec<(Key, usize)>> {
    use pageserver::tenant::blob_io::BlobCursor;
    use pageserver::tenant::block_io::BlockReader;

@@ -70,11 +79,48 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
        },
    )?;
    let mut cursor = BlockCursor::new(&file);
+
+    let mut result = vec![];
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
-        println!("key:{} value_len:{}", k, value.len());
+        result.push((k, value.len()));
    }
    // TODO(chi): special handling for last key?
+    Ok(result)
+}
+
+// Summarize a delta layer per fixed keyspace segment.
+//
+// The entire i128 keyspace is divided into pre-assigned segments of 1024
+// consecutive keys (8MB worth of 8KB pages). Keys are grouped by segment and
+// the summed value length is reported for every segment present in the file.
+//
+// 8MB is chosen because we're unlikely to make s3 partial downloads smaller
+// than that (due to cost), so per-segment layer metadata could be enough to
+// generate good test data for write amplification tests. The segments are
+// fixed, and don't depend on what keyspace is actually in use.
+fn read_delta_segments(path: impl AsRef<Path>) -> Result<Vec<(i128, usize)>> {
+    use itertools::Itertools;
+
+    // A page is 8KB. So 1024 (= 1 << 10) pages are 8MB.
+    const SEGMENT_SHIFT: u32 = 10;
+    let entries = read_delta_file(path)?;
+    // Entries come back sorted by key (per `read_delta_file`), so equal segments
+    // are expected to be adjacent; `group_by` only merges adjacent entries.
+    let runs = entries.iter().group_by(|(key, _)| key.to_i128() >> SEGMENT_SHIFT);
+    let mut segment_sizes = Vec::new();
+    for (segment, run) in &runs {
+        segment_sizes.push((segment, run.map(|(_, len)| len).sum::<usize>()));
+    }
+    Ok(segment_sizes)
+}
+
+fn summarize_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    // TODO write in some compressed binary format
+    for (segment, size) in read_delta_segments(path)? {
+        println!("segment:{} size:{}", segment, size);
+    }
+
    Ok(())
 }

@@ -153,7 +199,38 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path())?;
+                            for (k, len) in read_delta_file(layer.path())? {
+                                println!("key:{} value_len:{}", k, len);
+                            }
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::GetStats {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        if layer_file.is_delta {
+                            summarize_delta_file(layer.path())?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -335,116 +335,31 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

-    // Startup staging or optimizing:
-    //
-    // We want to minimize downtime for `page_service` connections, and trying not to overload
-    // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
-    //
-    // init_done_rx will notify when all initial load operations have completed.
-    //
-    // background_jobs_can_start (same name used to hold off background jobs from starting at
-    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
-    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
-    // (background_task_maximum_delay).
+    // All tenant load operations carry this while they are ongoing; it will be dropped once those
+    // operations finish, either successfully or in some other manner. Either way, the initial load
+    // is then done, and we can start the global background tasks.
    let (init_done_tx, init_done_rx) = utils::completion::channel();

-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
-    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
-
-    let order = pageserver::InitializationOrder {
-        initial_tenant_load: Some(init_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: init_logical_size_done_tx,
-        background_jobs_can_start: background_jobs_barrier.clone(),
-    };
-
    // Scan the local 'tenants/' directory and start loading the tenants
    let init_started_at = std::time::Instant::now();
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        broker_client.clone(),
        remote_storage.clone(),
-        order,
+        (init_done_tx, init_done_rx.clone()),
    ))?;

    BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
-        let shutdown_pageserver = shutdown_pageserver.clone();
-        let drive_init = async move {
-            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
-
+        let init_done_rx = init_done_rx.clone();
+        async move {
            init_done_rx.wait().await;
-            // initial logical sizes can now start, as they were waiting on init_done_rx.

-            scopeguard::ScopeGuard::into_inner(guard);
-
-            let init_done = std::time::Instant::now();
-            let elapsed = init_done - init_started_at;
+            let elapsed = init_started_at.elapsed();

            tracing::info!(
                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed"
+                "Initial load completed."
            );
-
-            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
-
-            let timeout = conf.background_task_maximum_delay;
-
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-
-            let init_sizes_done = tokio::select! {
-                _ = &mut init_sizes_done => {
-                    let now = std::time::Instant::now();
-                    tracing::info!(
-                        from_init_done_millis = (now - init_done).as_millis(),
-                        from_init_millis = (now - init_started_at).as_millis(),
-                        "Initial logical sizes completed"
-                    );
-                    None
-                }
-                _ = tokio::time::sleep(timeout) => {
-                    tracing::info!(
-                        timeout_millis = timeout.as_millis(),
-                        "Initial logical size timeout elapsed; starting background jobs"
-                    );
-                    Some(init_sizes_done)
-                }
-            };
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
-            // allow background jobs to start
-            drop(background_jobs_can_start);
-
-            if let Some(init_sizes_done) = init_sizes_done {
-                // ending up here is not a bug; at the latest logical sizes will be queried by
-                // consumption metrics.
-                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-                init_sizes_done.await;
-
-                scopeguard::ScopeGuard::into_inner(guard);
-
-                let now = std::time::Instant::now();
-                tracing::info!(
-                    from_init_done_millis = (now - init_done).as_millis(),
-                    from_init_millis = (now - init_started_at).as_millis(),
-                    "Initial logical sizes completed after timeout (background jobs already started)"
-                );
-
-            }
-        };
-
-        async move {
-            let mut drive_init = std::pin::pin!(drive_init);
-            // just race these tasks
-            tokio::select! {
-                _ = shutdown_pageserver.cancelled() => {},
-                _ = &mut drive_init => {},
-            }
        }
    });

@@ -459,7 +374,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
-            background_jobs_barrier.clone(),
+            init_done_rx.clone(),
        )?;
    }

@@ -495,50 +410,45 @@ fn start_pageserver(
                Ok(())
            },
        );
-    }

-    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
-        let metrics_ctx = RequestContext::todo_child(
-            TaskKind::MetricsCollection,
-            // This task itself shouldn't download anything.
-            // The actual size calculation does need downloads, and
-            // creates a child context with the right DownloadBehavior.
-            DownloadBehavior::Error,
-        );
-        task_mgr::spawn(
-            crate::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MetricsCollection,
-            None,
-            None,
-            "consumption metrics collection",
-            true,
-            async move {
-                // first wait until background jobs are cleared to launch.
-                //
-                // this is because we only process active tenants and timelines, and the
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                // which will not be rate-limited.
-                let cancel = task_mgr::shutdown_token();
+        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            let init_done_rx = init_done_rx;
+            let metrics_ctx = RequestContext::todo_child(
+                TaskKind::MetricsCollection,
+                // This task itself shouldn't download anything.
+                // The actual size calculation does need downloads, and
+                // creates a child context with the right DownloadBehavior.
+                DownloadBehavior::Error,
+            );
+            task_mgr::spawn(
+                MGMT_REQUEST_RUNTIME.handle(),
+                TaskKind::MetricsCollection,
+                None,
+                None,
+                "consumption metrics collection",
+                true,
+                async move {
+                    // first wait for initial load to complete before first iteration.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    init_done_rx.wait().await;

-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); },
-                    _ = background_jobs_barrier.wait() => {}
-                };
-
-                pageserver::consumption_metrics::collect_metrics(
-                    metric_collection_endpoint,
-                    conf.metric_collection_interval,
-                    conf.cached_metric_collection_interval,
-                    conf.synthetic_size_calculation_interval,
-                    conf.id,
-                    metrics_ctx,
-                )
-                .instrument(info_span!("metrics_collection"))
-                .await?;
-                Ok(())
-            },
-        );
+                    pageserver::consumption_metrics::collect_metrics(
+                        metric_collection_endpoint,
+                        conf.metric_collection_interval,
+                        conf.cached_metric_collection_interval,
+                        conf.synthetic_size_calculation_interval,
+                        conf.id,
+                        metrics_ctx,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                },
+            );
+        }
    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
@@ -573,8 +483,6 @@ fn start_pageserver(
        );
    }

-    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
-
    // All started up! Now just sit and wait for shutdown signal.
    ShutdownSignals::handle(|signal| match signal {
        Signal::Quit => {
@@ -590,11 +498,6 @@ fn start_pageserver(
                "Got {}. Terminating gracefully in fast shutdown mode",
                signal.name()
            );
-
-            // This cancels the `shutdown_pageserver` cancellation tree.
-            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-            // The plan is to change that over time.
-            shutdown_pageserver.take();
            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -63,7 +63,6 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    ///
    /// Default built-in configuration file.
@@ -92,16 +91,15 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

+
 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}

-#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
-
-[tenant_config]
+# [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
 #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
 #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
+#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'

 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}
@@ -111,8 +109,7 @@ pub mod defaults {
 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false
-
-[remote_storage]
+# [remote_storage]

 "###
    );
@@ -190,15 +187,6 @@ pub struct PageServerConf {
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
-
-    /// How long will background tasks be delayed at most after initial load of tenants.
-    ///
-    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
-    /// as we now isolate initial loading, initial logical size calculation and background tasks.
-    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
-    /// has it's initial logical size calculated. Not running background tasks for some seconds is
-    /// not terrible.
-    pub background_task_maximum_delay: Duration,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -271,8 +259,6 @@ struct PageServerConfigBuilder {
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
-
-    background_task_maximum_delay: BuilderValue<Duration>,
 }

 impl Default for PageServerConfigBuilder {
@@ -330,11 +316,6 @@ impl Default for PageServerConfigBuilder {
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
-
-            background_task_maximum_delay: Set(humantime::parse_duration(
-                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
-            )
-            .unwrap()),
        }
    }
 }
@@ -459,10 +440,6 @@ impl PageServerConfigBuilder {
            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
    }

-    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
-        self.background_task_maximum_delay = BuilderValue::Set(delay);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -545,9 +522,6 @@ impl PageServerConfigBuilder {
                .ok_or(anyhow!(
                    "missing ondemand_download_behavior_treat_error_as_warn"
                ))?,
-            background_task_maximum_delay: self
-                .background_task_maximum_delay
-                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
        })
    }
 }
@@ -736,7 +710,6 @@ impl PageServerConf {
                    )
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
-                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -904,7 +877,6 @@ impl PageServerConf {
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
-            background_task_maximum_delay: Duration::ZERO,
        }
    }
 }
@@ -1064,7 +1036,6 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

 log_format = 'json'
-background_task_maximum_delay = '334 s'

 "#;

@@ -1123,9 +1094,6 @@ background_task_maximum_delay = '334 s'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: humantime::parse_duration(
-                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
-                )?,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1180,7 +1148,6 @@ background_task_maximum_delay = '334 s'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: Duration::from_secs(334),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,8 +24,6 @@ const RESIDENT_SIZE: &str = "resident_size";
 const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";

-const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
-
 #[serde_as]
 #[derive(Serialize, Debug)]
 struct Ids {
@@ -75,10 +73,7 @@ pub async fn collect_metrics(
    );

    // define client here to reuse it for all requests
-    let client = reqwest::ClientBuilder::new()
-        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
-        .build()
-        .expect("Failed to create http client with timeout");
+    let client = reqwest::Client::new();
    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

@@ -88,7 +83,7 @@ pub async fn collect_metrics(
                info!("collect_metrics received cancellation request");
                return Ok(());
            },
-            tick_at = ticker.tick() => {
+            _ = ticker.tick() => {

                // send cached metrics every cached_metric_collection_interval
                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
@@ -98,12 +93,6 @@ pub async fn collect_metrics(
                }

                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    metric_collection_interval,
-                    "consumption_metrics_collect_metrics",
-                );
            }
        }
    }
@@ -234,18 +223,14 @@ pub async fn collect_metrics_iteration(
        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
-
-        if tenant_synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
-        }
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: SYNTHETIC_STORAGE_SIZE,
+            },
+            tenant_synthetic_size,
+        ));
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
@@ -288,43 +273,32 @@ pub async fn collect_metrics_iteration(
        })
        .expect("PageserverConsumptionMetric should not fail serialization");

-        const MAX_RETRIES: u32 = 3;
+        let res = client
+            .post(metric_collection_endpoint.clone())
+            .json(&chunk_json)
+            .send()
+            .await;

-        for attempt in 0..MAX_RETRIES {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
-                .send()
-                .await;
-
-            match res {
-                Ok(res) => {
-                    if res.status().is_success() {
-                        // update cached metrics after they were sent successfully
-                        for (curr_key, curr_val) in chunk.iter() {
-                            cached_metrics.insert(curr_key.clone(), *curr_val);
-                        }
-                    } else {
-                        error!("metrics endpoint refused the sent metrics: {:?}", res);
-                        for metric in chunk_to_send
-                            .iter()
-                            .filter(|metric| metric.value > (1u64 << 40))
-                        {
-                            // Report if the metric value is suspiciously large
+        match res {
+            Ok(res) => {
+                if res.status().is_success() {
+                    // update cached metrics after they were sent successfully
+                    for (curr_key, curr_val) in chunk.iter() {
+                        cached_metrics.insert(curr_key.clone(), *curr_val);
+                    }
+                } else {
+                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                    for metric in chunk_to_send.iter() {
+                        // Report if the metric value is suspiciously large
+                        if metric.value > (1u64 << 40) {
                            error!("potentially abnormal metric value: {:?}", metric);
                        }
                    }
-                    break;
-                }
-                Err(err) if err.is_timeout() => {
-                    error!(attempt, "timeout sending metrics, retrying immediately");
-                    continue;
-                }
-                Err(err) => {
-                    error!(attempt, ?err, "failed to send metrics");
-                    break;
                }
            }
+            Err(err) => {
+                error!("failed to send metrics: {:?}", err);
+            }
        }
    }
 }
@@ -343,7 +317,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+        _ = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -369,12 +343,6 @@ pub async fn calculate_synthetic_size_worker(
                    }

                }
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    synthetic_size_calculation_interval,
-                    "consumption_metrics_synthetic_size_worker",
-                );
            }
        }
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -83,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
-    background_jobs_barrier: completion::Barrier,
+    init_done: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -100,16 +100,18 @@ pub fn launch_disk_usage_global_eviction_task(
        "disk usage based eviction",
        false,
        async move {
-            let cancel = task_mgr::shutdown_token();
-
            // wait until initial load is complete, because we cannot evict from loading tenants.
-            tokio::select! {
-                _ = cancel.cancelled() => { return Ok(()); },
-                _ = background_jobs_barrier.wait() => { }
-            };
+            init_done.wait().await;

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
-                .await;
+            disk_usage_eviction_task(
+                &state,
+                task_config,
+                storage,
+                &conf.tenants_path(),
+                task_mgr::shutdown_token(),
+            )
+            .await;
+            info!("disk usage based eviction task finishing");
            Ok(())
        },
    );
@@ -125,16 +127,13 @@ async fn disk_usage_eviction_task(
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
-    scopeguard::defer! {
-        info!("disk usage based eviction task finishing");
-    };
-
    use crate::tenant::tasks::random_init_delay;
    {
        if random_init_delay(task_config.period, &cancel)
            .await
            .is_err()
        {
+            info!("shutting down");
            return;
        }
    }
@@ -169,6 +168,7 @@ async fn disk_usage_eviction_task(
        tokio::select! {
            _ = tokio::time::sleep_until(sleep_until) => {},
            _ = cancel.cancelled() => {
+                info!("shutting down");
                break
            }
        }
@@ -315,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            partition,
            candidate.layer.get_tenant_id(),
            candidate.layer.get_timeline_id(),
-            candidate.layer,
+            candidate.layer.filename().file_name(),
        );
    }

@@ -517,7 +517,7 @@ async fn collect_eviction_candidates(
            if !tl.is_active() {
                continue;
            }
-            let info = tl.get_local_layers_for_disk_usage_eviction().await;
+            let info = tl.get_local_layers_for_disk_usage_eviction();
            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -186,8 +186,10 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
    delete:
-      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
+      description: "Attempts to delete the specified timeline. 500 errors should be retried."
      responses:
+        "200":
+          description: Ok
        "400":
          description: Error when no tenant id found in path or no timeline id
          content:
@@ -212,14 +214,8 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: Deletion is already in progress, continue polling
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
        "412":
-          description: Tenant is missing, or timeline has children
+          description: Tenant is missing
          content:
            application/json:
              schema:
@@ -390,7 +386,6 @@ paths:
        "202":
          description: Tenant attaching scheduled
        "400":
-          description: Bad Request
          content:
            application/json:
              schema:
@@ -722,12 +717,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ForbiddenError"
-        "406":
-          description: Permanently unsatisfiable request, don't retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
        "409":
          description: Timeline already exists, creation skipped
          content:
@@ -939,28 +928,12 @@ components:
              writing to the tenant's S3 state, so, DO NOT ATTACH the
              tenant to any other pageserver, or we risk split-brain.
            - `attached` means that the attach operation has completed,
-              successfully
-            - `failed` means that attach has failed. For reason check corresponding `reason` failed.
-              `failed` is the terminal state, retrying attach call wont resolve the issue.
-              For example this can be caused by s3 being unreachable. The retry may be implemented
-              with call to detach, though it would be better to not automate it and inspec failed state
-              manually before proceeding with a retry.
+              maybe successfully, maybe not. Perform a health check at
+              the Postgres level to determine healthiness of the tenant.

            See the tenant `/attach` endpoint for more information.
-          type: object
-          required:
-            - slug
-            - data
-          properties:
-            slug:
-              type: string
-              enum: [ "maybe", "attached", "failed" ]
-            data:
-              type: object
-              properties:
-                reason:
-                  type: string
-
+          type: string
+          enum: [ "maybe", "attached" ]
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -23,6 +23,7 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::disk_usage_eviction_task;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -34,7 +35,6 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
-use crate::{disk_usage_eviction_task, tenant};
 use utils::{
    auth::JwtAuth,
    http::{
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
 impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
            e @ GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
            SetNewTenantConfigError::GetTenant(tid) => {
-                ApiError::NotFound(anyhow!("tenant {}", tid).into())
+                ApiError::NotFound(anyhow!("tenant {}", tid))
            }
            e @ SetNewTenantConfigError::Persist(_) => {
                ApiError::InternalServerError(anyhow::Error::new(e))
@@ -182,12 +182,10 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
        match value {
-            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
-            HasChildren(children) => ApiError::PreconditionFailed(
-                format!("Cannot delete timeline which has child timelines: {children:?}")
-                    .into_boxed_str(),
-            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
+            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
+                "Cannot delete timeline which has child timelines"
+            )),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -199,9 +197,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
-                "Requested tenant is missing".to_owned().into_boxed_str(),
-            ),
+            Tenant(GetTenantError::NotFound(..)) => {
+                ApiError::PreconditionFailed("Requested tenant is missing")
+            }
            Tenant(t) => ApiError::from(t),
            Timeline(t) => ApiError::from(t),
        }
@@ -216,7 +214,7 @@ async fn build_timeline_info(
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    let mut info = build_timeline_info_common(timeline, ctx).await?;
+    let mut info = build_timeline_info_common(timeline, ctx)?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -234,7 +232,7 @@ async fn build_timeline_info(
    Ok(info)
 }

-async fn build_timeline_info_common(
+fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
@@ -265,7 +263,7 @@ async fn build_timeline_info_common(
            None
        }
    };
-    let current_physical_size = Some(timeline.layer_size_sum().await);
+    let current_physical_size = Some(timeline.layer_size_sum());
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

@@ -328,22 +326,14 @@ async fn timeline_create_handler(
            &ctx,
        )
        .await {
-            Ok(new_timeline) => {
+            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
-                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Err(tenant::CreateTimelineError::AlreadyExists) => {
-                json_response(StatusCode::CONFLICT, ())
-            }
-            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
-                json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
-                    format!("{err:#}")
-                ))
-            }
-            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
@@ -405,7 +395,7 @@ async fn timeline_detail_handler(

        let timeline = tenant
            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+            .map_err(ApiError::NotFound)?;

        let timeline_info = build_timeline_info(
            &timeline,
@@ -504,8 +494,7 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;

-    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
-    json_response(StatusCode::ACCEPTED, ())
+    json_response(StatusCode::OK, ())
 }

 async fn tenant_detach_handler(
@@ -600,7 +589,7 @@ async fn tenant_status(
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum().await;
+            current_physical_size += timeline.layer_size_sum();
        }

        let state = tenant.current_state();
@@ -710,7 +699,7 @@ async fn layer_map_info_handler(
    check_permission(&request, Some(tenant_id))?;

    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let layer_map_info = timeline.layer_map_info(reset).await;
+    let layer_map_info = timeline.layer_map_info(reset);

    json_response(StatusCode::OK, layer_map_info)
 }
@@ -1069,7 +1058,7 @@ async fn timeline_download_remote_layers_handler_get(
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+        .map_err(ApiError::NotFound)?;
    json_response(StatusCode::OK, info)
 }

@@ -1080,7 +1069,7 @@ async fn active_timeline_of_active_tenant(
    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))
+        .map_err(ApiError::NotFound)
 }

 async fn always_panic_handler(
@@ -1136,6 +1125,8 @@ async fn disk_usage_eviction_run(
        freed_bytes: 0,
    };

+    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
+
    let (tx, rx) = tokio::sync::oneshot::channel();

    let state = get_state(&r);
@@ -1153,7 +1144,7 @@ async fn disk_usage_eviction_run(
    let _g = cancel.drop_guard();

    crate::task_mgr::spawn(
-        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
+        MGMT_REQUEST_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush().await?;
+            modification.flush()?;
        }
    }

    // We're done importing all the data files.
-    modification.commit().await?;
+    modification.commit()?;

    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
    // because there is no guarantee about the order in which we are processing segments.
    // ignore "relation already exists" error
    //
-    // FIXME: Keep track of which relations we've already created?
+    // FIXME: use proper error type for this, instead of parsing the error message.
+    // Or better yet, keep track of which relations we've already created
    // https://github.com/neondatabase/neon/issues/3309
    if let Err(e) = modification
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
-            }
-            _ => return Err(e.into()),
+        if e.to_string().contains("already exists") {
+            debug!("relation {} already exists. we must be extending it", rel);
+        } else {
+            return Err(e);
        }
    }

@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush().await?;
+                modification.flush()?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;

-    modification.commit().await?;
+    modification.commit()?;
    Ok(())
 }

@@ -594,7 +594,7 @@ async fn import_file(
        // zenith.signal is not necessarily the last file, that we handle
        // but it is ok to call `finish_write()`, because final `modification.commit()`
        // will update lsn once more to the final one.
-        let writer = modification.tline.writer().await;
+        let writer = modification.tline.writer();
        writer.finish_write(prev_lsn);

        debug!("imported zenith signal {}", prev_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -58,6 +58,12 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    // the checkpoint and GC tasks.
    tenant::mgr::shutdown_all_tenants().await;

+    // Stop syncing with remote storage.
+    //
+    // FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
+    // Should it?
+    task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
+
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -132,29 +138,6 @@ pub fn is_uninit_mark(path: &Path) -> bool {
    }
 }

-/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
-/// blocking.
-///
-/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
-/// delaying is needed.
-#[derive(Clone)]
-pub struct InitializationOrder {
-    /// Each initial tenant load task carries this until completion.
-    pub initial_tenant_load: Option<utils::completion::Completion>,
-
-    /// Barrier for when we can start initial logical size calculations.
-    pub initial_logical_size_can_start: utils::completion::Barrier,
-
-    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
-    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: utils::completion::Completion,
-
-    /// Barrier for when we can start any background jobs.
-    ///
-    /// This can be broken up later on, but right now there is just one class of a background job.
-    pub background_jobs_can_start: utils::completion::Barrier,
-}
-
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
-use metrics::metric_vec_duration::DurationResultObserver;
+use metrics::core::{AtomicU64, GenericCounter};
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+    UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
@@ -95,19 +95,21 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
 });

 // Metrics collected on operations on the storage repository.
-pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
-        "Time spent in reconstruct_value (reconstruct a page from deltas)",
+        "Time spent in reconstruct_value",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
+static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -122,130 +124,15 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
+static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });

-pub struct PageCacheMetrics {
-    pub read_accesses_materialized_page: IntCounter,
-    pub read_accesses_ephemeral: IntCounter,
-    pub read_accesses_immutable: IntCounter,
-
-    pub read_hits_ephemeral: IntCounter,
-    pub read_hits_immutable: IntCounter,
-    pub read_hits_materialized_page_exact: IntCounter,
-    pub read_hits_materialized_page_older_lsn: IntCounter,
-}
-
-static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_hits_total",
-        "Number of read accesses to the page cache that hit",
-        &["key_kind", "hit_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_accesses_total",
-        "Number of read accesses to the page cache",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
-    read_accesses_materialized_page: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-
-    read_accesses_ephemeral: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-
-    read_accesses_immutable: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-
-    read_hits_ephemeral: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["ephemeral", "-"])
-            .unwrap()
-    },
-
-    read_hits_immutable: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["immutable", "-"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_exact: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "exact"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_older_lsn: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "older_lsn"])
-            .unwrap()
-    },
-});
-
-pub struct PageCacheSizeMetrics {
-    pub max_bytes: UIntGauge,
-
-    pub current_bytes_ephemeral: UIntGauge,
-    pub current_bytes_immutable: UIntGauge,
-    pub current_bytes_materialized_page: UIntGauge,
-}
-
-static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_page_cache_size_current_bytes",
-        "Current size of the page cache in bytes, by key kind",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
-
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
@@ -320,11 +207,11 @@ pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {

 pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
-        "pageserver_tenant_synthetic_cached_size_bytes",
-        "Synthetic size of each tenant in bytes",
+        "pageserver_tenant_synthetic_size",
+        "Synthetic size of each tenant",
        &["tenant_id"]
    )
-    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
+    .expect("Failed to register pageserver_tenant_synthetic_size metric")
 });

 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
@@ -541,27 +428,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct BasebackupQueryTime(HistogramVec);
-pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
-    BasebackupQueryTime({
-        register_histogram_vec!(
-            "pageserver_basebackup_query_seconds",
-            "Histogram of basebackup queries durations, by result type",
-            &["result"],
-            CRITICAL_OP_BUCKETS.into(),
-        )
-        .expect("failed to define a metric")
-    })
-});
-
-impl DurationResultObserver for BasebackupQueryTime {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
-        let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
-        metric.observe(duration.as_secs_f64());
-    }
-}
-
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
@@ -886,7 +752,10 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
+    pub reconstruct_time_histo: Histogram,
    pub get_reconstruct_data_time_histo: Histogram,
+    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
+    pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -914,9 +783,15 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
+        let reconstruct_time_histo = RECONSTRUCT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -958,13 +833,19 @@ impl TimelineMetrics {
        let read_num_fs_layers = READ_NUM_FS_LAYERS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
+            reconstruct_time_histo,
            get_reconstruct_data_time_histo,
+            materialized_page_cache_hit_counter,
+            materialized_page_cache_hit_upon_request_counter,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -991,7 +872,10 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
+        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -1084,6 +968,7 @@ impl RemoteTimelineClientMetrics {
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1105,6 +990,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntGauge {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1125,6 +1011,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1145,6 +1032,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_started_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1165,6 +1053,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_finished_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1430,8 +1319,4 @@ pub fn preinitialize_metrics() {

    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
-
-    // Python tests need these.
-    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
-    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,8 +53,8 @@ use utils::{
    lsn::Lsn,
 };

+use crate::repository::Key;
 use crate::tenant::writeback_ephemeral_file;
-use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,8 +187,6 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-
-    size_metrics: &'static PageCacheSizeMetrics,
 }

 ///
@@ -315,10 +313,6 @@ impl PageCache {
        key: &Key,
        lsn: Lsn,
    ) -> Option<(Lsn, PageReadGuard)> {
-        crate::metrics::PAGE_CACHE
-            .read_accesses_materialized_page
-            .inc();
-
        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
                tenant_id,
@@ -329,21 +323,8 @@ impl PageCache {
        };

        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: available_lsn,
-            } = cache_key
-            {
-                if available_lsn == lsn {
-                    crate::metrics::PAGE_CACHE
-                        .read_hits_materialized_page_exact
-                        .inc();
-                } else {
-                    crate::metrics::PAGE_CACHE
-                        .read_hits_materialized_page_older_lsn
-                        .inc();
-                }
-                Some((available_lsn, guard))
+            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
+                Some((lsn, guard))
            } else {
                panic!("unexpected key type in slot");
            }
@@ -518,31 +499,11 @@ impl PageCache {
    /// ```
    ///
    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
-        let (read_access, hit) = match cache_key {
-            CacheKey::MaterializedPage { .. } => {
-                unreachable!("Materialized pages use lookup_materialized_page")
-            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
-                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
-            ),
-            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
-                &crate::metrics::PAGE_CACHE.read_hits_immutable,
-            ),
-        };
-        read_access.inc();
-
-        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
-                if is_first_iteration {
-                    hit.inc();
-                }
                return Ok(ReadBufResult::Found(read_guard));
            }
-            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) =
@@ -720,9 +681,6 @@ impl PageCache {

                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
                        versions.remove(version_idx);
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .sub_page_sz(1);
                        if versions.is_empty() {
                            old_entry.remove_entry();
                        }
@@ -735,13 +693,11 @@ impl PageCache {
                let mut map = self.ephemeral_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
            }
        }
    }
@@ -769,9 +725,6 @@ impl PageCache {
                                slot_idx,
                            },
                        );
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .add_page_sz(1);
                        None
                    }
                }
@@ -782,7 +735,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
                        None
                    }
                }
@@ -793,7 +745,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
                        None
                    }
                }
@@ -893,12 +844,6 @@ impl PageCache {

        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

-        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
-        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
-        size_metrics.current_bytes_immutable.set_page_sz(0);
-        size_metrics.current_bytes_materialized_page.set_page_sz(0);
-
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
@@ -921,30 +866,6 @@ impl PageCache {
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
-            size_metrics,
        }
    }
 }
-
-trait PageSzBytesMetric {
-    fn set_page_sz(&self, count: usize);
-    fn add_page_sz(&self, count: usize);
-    fn sub_page_sz(&self, count: usize);
-}
-
-#[inline(always)]
-fn count_times_page_sz(count: usize) -> u64 {
-    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
-}
-
-impl PageSzBytesMetric for metrics::UIntGauge {
-    fn set_page_sz(&self, count: usize) {
-        self.set(count_times_page_sz(count));
-    }
-    fn add_page_sz(&self, count: usize) {
-        self.add(count_times_page_sz(count));
-    }
-    fn sub_page_sz(&self, count: usize) {
-        self.sub(count_times_page_sz(count));
-    }
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -390,9 +390,7 @@ impl PageServerHandler {
        };

        // Check that the timeline exists
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -904,7 +902,7 @@ where

            self.check_permission(Some(tenant_id))?;

-            let lsn = if params.len() >= 3 {
+            let lsn = if params.len() == 3 {
                Some(
                    Lsn::from_str(params[2])
                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
@@ -913,24 +911,10 @@ where
                None
            };

-            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*crate::metrics::BASEBACKUP_QUERY_TIME,
-                async move {
-                    self.handle_basebackup_request(
-                        pgb,
-                        tenant_id,
-                        timeline_id,
-                        lsn,
-                        None,
-                        false,
-                        ctx,
-                    )
-                    .await?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
-                },
-            )
-            .await?;
+            // Check that the timeline exists
+            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
+                .await?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1246,6 +1230,6 @@ async fn get_active_tenant_timeline(
        .map_err(GetActiveTimelineError::Tenant)?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+        .map_err(GetActiveTimelineError::Timeline)?;
    Ok(timeline)
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,16 +43,6 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

-#[derive(Debug, thiserror::Error)]
-pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
-    #[error("invalid relnode")]
-    InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -111,9 +101,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(
-                RelationError::InvalidRelnode.into(),
-            ));
+            return Err(PageReconstructError::Other(anyhow::anyhow!(
+                "invalid relnode"
+            )));
        }

        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -158,9 +148,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(
-                RelationError::InvalidRelnode.into(),
-            ));
+            return Err(PageReconstructError::Other(anyhow::anyhow!(
+                "invalid relnode"
+            )));
        }

        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -203,9 +193,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(
-                RelationError::InvalidRelnode.into(),
-            ));
+            return Err(PageReconstructError::Other(anyhow::anyhow!(
+                "invalid relnode"
+            )));
        }

        // first try to lookup relation in cache
@@ -709,20 +699,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    #[cfg(test)]
-    pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
-        self.init_empty()?;
-        self.put_control_file(bytes::Bytes::from_static(
-            b"control_file contents do not matter",
-        ))
-        .context("put_control_file")?;
-        self.put_checkpoint(bytes::Bytes::from_static(
-            b"checkpoint_file contents do not matter",
-        ))
-        .context("put_checkpoint_file")?;
-        Ok(())
-    }
-
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -734,7 +710,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        rec: NeonWalRecord,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -761,7 +737,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -885,38 +861,32 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
-        if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
-        }
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
            // Didn't exist. Update dbdir
            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
-            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));

            // and create the RelDirectory
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
        };

        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            return Err(RelationError::AlreadyExists);
+            anyhow::bail!("rel {rel} already exists");
        }
        self.put(
            rel_dir_key,
-            Value::Image(Bytes::from(
-                RelDirectory::ser(&rel_dir).context("serialize")?,
-            )),
+            Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
        );

        // Put size
@@ -941,7 +911,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        let last_lsn = self.tline.get_last_record_lsn();
        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
@@ -972,7 +942,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");

        // Put size
        let size_key = rel_size_to_key(rel);
@@ -993,7 +963,7 @@ impl<'a> DatadirModification<'a> {

    /// Drop a relation.
    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");

        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1138,7 +1108,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub async fn flush(&mut self) -> anyhow::Result<()> {
+    pub fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1146,20 +1116,19 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let writer = self.tline.writer().await;
+        let writer = self.tline.writer();

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::new();
-        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(key) || is_slru_block_key(key) {
-                // This bails out on first error without modifying pending_updates.
-                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value).await?;
+        let mut result: anyhow::Result<()> = Ok(());
+        self.pending_updates.retain(|&key, value| {
+            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
+                result = writer.put(key, self.lsn, value);
+                false
            } else {
-                retained_pending_updates.insert(key, value);
+                true
            }
-        }
-        self.pending_updates.extend(retained_pending_updates);
+        });
+        result?;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1174,17 +1143,17 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub async fn commit(&mut self) -> anyhow::Result<()> {
-        let writer = self.tline.writer().await;
+    pub fn commit(&mut self) -> anyhow::Result<()> {
+        let writer = self.tline.writer();
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value).await?;
+            writer.put(key, lsn, &value)?;
        }
        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn).await?;
+            writer.delete(key_range, lsn)?;
        }

        writer.finish_write(lsn);
@@ -1624,6 +1593,20 @@ fn is_slru_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff // and not SlruSegSize
 }

+#[cfg(test)] // test-only helper: creates a timeline and commits one initial modification to it
+pub fn create_test_timeline(
+    tenant: &crate::tenant::Tenant,
+    timeline_id: utils::id::TimelineId,
+    pg_version: u32,
+    ctx: &RequestContext,
+) -> anyhow::Result<std::sync::Arc<Timeline>> {
+    let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?; // starts at Lsn(8)
+    let mut m = tline.begin_modification(Lsn(8)); // same LSN the timeline was created with
+    m.init_empty()?; // NOTE(review): presumably queues the initial empty directory entries -- confirm
+    m.commit()?; // writes the pending updates/deletions and calls finish_write at the modification's LSN
+    Ok(tline)
+}
+
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,9 +257,6 @@ pub enum TaskKind {
    // task that handles attaching a tenant
    Attach,

-    // Used mostly for background deletion from s3
-    TimelineDeletionWorker,
-
    // task that handhes metrics collection
    MetricsCollection,

@@ -479,44 +476,27 @@ pub async fn shutdown_tasks(
                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
-                victim_tasks.push((
-                    Arc::clone(task),
-                    task.kind,
-                    task_mut.tenant_id,
-                    task_mut.timeline_id,
-                ));
+                victim_tasks.push(Arc::clone(task));
            }
        }
    }

-    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
-
-    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
+    for task in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
-            if log_all {
-                if tenant_id.is_none() {
-                    // there are quite few of these
-                    info!(name = task.name, kind = ?task_kind, "stopping global task");
-                } else {
-                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
-                }
-            }
-            let join_handle = tokio::select! {
-                biased;
-                _ = &mut join_handle => { None },
+            let completed = tokio::select! {
+                _ = &mut join_handle => { true },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
                    // lines.
                    info!("waiting for {} to shut down", task.name);
-                    Some(join_handle)
+                    false
                }
            };
-            if let Some(join_handle) = join_handle {
+            if !completed {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -38,8 +38,8 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -58,16 +58,14 @@ use std::sync::Arc;
 use utils::lsn::Lsn;

 use historic_layer_coverage::BufferedHistoricLayerCoverage;
-pub use historic_layer_coverage::LayerKey;
+pub use historic_layer_coverage::Replacement;

 use super::storage_layer::range_eq;
-use super::storage_layer::PersistentLayerDesc;

 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
-#[derive(Default)]
-pub struct LayerMap {
+pub struct LayerMap<L: ?Sized> {
    //
    // 'open_layer' holds the current InMemoryLayer that is accepting new
    // records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -88,11 +86,23 @@ pub struct LayerMap {
    pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,

    /// Index of the historic layers optimized for search
-    historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
+    historic: BufferedHistoricLayerCoverage<Arc<L>>,

    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
-    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
+    l0_delta_layers: Vec<Arc<L>>,
+}
+
+impl<L: ?Sized> Default for LayerMap<L> { // hand-written; a derive would add an `L: Default` bound
+    fn default() -> Self { // every field starts out as None or an empty collection
+        Self {
+            open_layer: None,
+            next_open_layer_at: None,
+            frozen_layers: VecDeque::default(),
+            l0_delta_layers: Vec::default(),
+            historic: BufferedHistoricLayerCoverage::default(),
+        }
+    }
 }

 /// The primary update API for the layer map.
@@ -100,21 +110,23 @@ pub struct LayerMap {
 /// Batching historic layer insertions and removals is good for
 /// performance and this struct helps us do that correctly.
 #[must_use]
-pub struct BatchedUpdates<'a> {
+pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
    // While we hold this exclusive reference to the layer map the type checker
    // will prevent us from accidentally reading any unflushed updates.
-    layer_map: &'a mut LayerMap,
+    layer_map: &'a mut LayerMap<L>,
 }

 /// Provide ability to batch more updates while hiding the read
 /// API so we don't accidentally read without flushing.
-impl BatchedUpdates<'_> {
+impl<L> BatchedUpdates<'_, L>
+where
+    L: ?Sized + Layer,
+{
    ///
    /// Insert an on-disk layer.
    ///
-    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
-    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
-        self.layer_map.insert_historic_noflush(layer_desc)
+    pub fn insert_historic(&mut self, layer: Arc<L>) {
+        self.layer_map.insert_historic_noflush(layer)
    }

    ///
@@ -122,8 +134,28 @@ impl BatchedUpdates<'_> {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
-        self.layer_map.remove_historic_noflush(layer_desc)
+    pub fn remove_historic(&mut self, layer: Arc<L>) {
+        self.layer_map.remove_historic_noflush(layer)
+    }
+
+    /// Replaces existing layer iff it is the `expected`.
+    ///
+    /// If the expected layer has been removed it will not be inserted by this function.
+    ///
+    /// Returned `Replacement` describes succeeding in replacement or the reason why it could not
+    /// be done.
+    ///
+    /// TODO replacement can be done without buffering and rebuilding layer map updates.
+    ///      One way to do that is to add a layer of indirection for returned values, so
+    ///      that we can replace values only by updating a hashmap.
+    pub fn replace_historic(
+        &mut self,
+        expected: &Arc<L>,
+        new: Arc<L>,
+    ) -> anyhow::Result<Replacement<Arc<L>>> {
+        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
+
+        self.layer_map.replace_historic_noflush(expected, new)
    }

    // We will flush on drop anyway, but this method makes it
@@ -139,19 +171,25 @@ impl BatchedUpdates<'_> {
 // than panic later or read without flushing.
 //
 // TODO maybe warn if flush hasn't explicitly been called
-impl Drop for BatchedUpdates<'_> {
+impl<L> Drop for BatchedUpdates<'_, L>
+where
+    L: ?Sized + Layer,
+{
    fn drop(&mut self) {
        self.layer_map.flush_updates();
    }
 }

 /// Return value of LayerMap::search
-pub struct SearchResult {
-    pub layer: Arc<PersistentLayerDesc>,
+pub struct SearchResult<L: ?Sized> {
+    pub layer: Arc<L>,
    pub lsn_floor: Lsn,
 }

-impl LayerMap {
+impl<L> LayerMap<L>
+where
+    L: ?Sized + Layer,
+{
    ///
    /// Find the latest layer (by lsn.end) that covers the given
    /// 'key', with lsn.start < 'end_lsn'.
@@ -183,7 +221,7 @@ impl LayerMap {
    /// NOTE: This only searches the 'historic' layers, *not* the
    /// 'open' and 'frozen' layers!
    ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
        let latest_delta = version.delta_coverage.query(key.to_i128());
        let latest_image = version.image_coverage.query(key.to_i128());
@@ -226,7 +264,7 @@ impl LayerMap {
    }

    /// Start a batch of updates, applied on drop
-    pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
+    pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
        BatchedUpdates { layer_map: self }
    }

@@ -235,18 +273,16 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::insert_historic
    ///
-    /// TODO(chi): remove L generic so that we do not need to pass layer object.
-    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
+    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
-
-        if Self::is_l0(&layer_desc) {
-            self.l0_delta_layers.push(layer_desc.clone().into());
-        }
-
        self.historic.insert(
-            historic_layer_coverage::LayerKey::from(&layer_desc),
-            layer_desc.into(),
+            historic_layer_coverage::LayerKey::from(&*layer),
+            Arc::clone(&layer),
        );
+
+        if Self::is_l0(&layer) {
+            self.l0_delta_layers.push(layer);
+        }
    }

    ///
@@ -254,15 +290,14 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
+    pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
-        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc) {
+            .remove(historic_layer_coverage::LayerKey::from(&*layer));
+
+        if Self::is_l0(&layer) {
            let len_before = self.l0_delta_layers.len();
-            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
-            l0_delta_layers.retain(|other| other.key() != layer_key);
-            self.l0_delta_layers = l0_delta_layers;
+            self.l0_delta_layers
+                .retain(|other| !Self::compare_arced_layers(other, &layer));
            // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
            // there's a chance that the comparison fails at runtime due to it comparing (pointer,
            // vtable) pairs.
@@ -274,6 +309,55 @@ impl LayerMap {
        }
    }

+    pub(self) fn replace_historic_noflush(
+        &mut self,
+        expected: &Arc<L>,
+        new: Arc<L>,
+    ) -> anyhow::Result<Replacement<Arc<L>>> {
+        let key = historic_layer_coverage::LayerKey::from(&**expected);
+        let other = historic_layer_coverage::LayerKey::from(&*new);
+
+        let expected_l0 = Self::is_l0(expected);
+        let new_l0 = Self::is_l0(&new);
+
+        anyhow::ensure!(
+            key == other,
+            "expected and new must have equal LayerKeys: {key:?} != {other:?}"
+        );
+
+        anyhow::ensure!(
+            expected_l0 == new_l0,
+            "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
+        );
+
+        let l0_index = if expected_l0 {
+            // find the index in case replace worked, we need to replace that as well
+            let pos = self
+                .l0_delta_layers
+                .iter()
+                .position(|slot| Self::compare_arced_layers(slot, expected));
+
+            if pos.is_none() {
+                return Ok(Replacement::NotFound);
+            }
+            pos
+        } else {
+            None
+        };
+
+        let replaced = self.historic.replace(&key, new.clone(), |existing| {
+            Self::compare_arced_layers(existing, expected)
+        });
+
+        if let Replacement::Replaced { .. } = &replaced {
+            if let Some(index) = l0_index {
+                self.l0_delta_layers[index] = new;
+            }
+        }
+
+        Ok(replaced)
+    }
+
    /// Helper function for BatchedUpdates::drop.
    pub(self) fn flush_updates(&mut self) {
        self.historic.rebuild();
@@ -299,7 +383,7 @@ impl LayerMap {
        let start = key.start.to_i128();
        let end = key.end.to_i128();

-        let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
+        let layer_covers = |layer: Option<Arc<L>>| match layer {
            Some(layer) => layer.get_lsn_range().start >= lsn.start,
            None => false,
        };
@@ -319,7 +403,7 @@ impl LayerMap {
        Ok(true)
    }

-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
+    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
        self.historic.iter()
    }

@@ -335,7 +419,7 @@ impl LayerMap {
        &self,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
+    ) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
        let version = match self.historic.get().unwrap().get_version(lsn.0) {
            Some(v) => v,
            None => return Ok(vec![]),
@@ -345,7 +429,7 @@ impl LayerMap {
        let end = key_range.end.to_i128();

        // Initialize loop variables
-        let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
+        let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
        let mut current_key = start;
        let mut current_val = version.image_coverage.query(start);

@@ -364,7 +448,7 @@ impl LayerMap {
        Ok(coverage)
    }

-    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
+    pub fn is_l0(layer: &L) -> bool {
        range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
    }

@@ -390,7 +474,7 @@ impl LayerMap {
    /// TODO The optimal number should probably be slightly higher than 1, but to
    ///      implement that we need to plumb a lot more context into this function
    ///      than just the current partition_range.
-    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
+    pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
        // Case 1
        if !Self::is_l0(layer) {
            return true;
@@ -621,8 +705,8 @@ impl LayerMap {
    }

    /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
-        Ok(self.l0_delta_layers.to_vec())
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
+        Ok(self.l0_delta_layers.clone())
    }

    /// debugging function to print out the contents of the layer map
@@ -647,51 +731,72 @@ impl LayerMap {
        println!("End dump LayerMap");
        Ok(())
    }
+
+    /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
+    ///
+    /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
+    #[inline(always)]
+    pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
+        // "dyn Trait" objects are "fat pointers" in that they have two components:
+        // - pointer to the object
+        // - pointer to the vtable
+        //
+        // rust does not provide a guarantee that these vtables are unique; however,
+        // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
+        // pointer and the vtable need to be equal.
+        //
+        // See: https://github.com/rust-lang/rust/issues/103763
+        //
+        // A future version of rust will most likely use this form below, where we cast each
+        // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
+        // not affect the comparison.
+        //
+        // See: https://github.com/rust-lang/rust/pull/106450
+        let left = Arc::as_ptr(left) as *const ();
+        let right = Arc::as_ptr(right) as *const ();
+
+        left == right
+    }
 }

 #[cfg(test)]
 mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
+    use super::{LayerMap, Replacement};
+    use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
    use std::str::FromStr;
    use std::sync::Arc;

    mod l0_delta_layers_updated {

-        use crate::tenant::{
-            storage_layer::{PersistentLayer, PersistentLayerDesc},
-            timeline::LayerFileManager,
-        };
-
        use super::*;

        #[test]
        fn for_full_range_delta() {
            // l0_delta_layers are used by compaction, and should observe all buffered updates
            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
+                true
+            )
        }

        #[test]
        fn for_non_full_range_delta() {
            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
+                "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
+                // because not full range
+                false
+            )
        }

        #[test]
        fn for_image() {
            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
+                "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
+                // code only checks if it is a full range layer, doesn't care about images, which must
+                // mean we should in practice never have full range images
+                false
+            )
        }

        #[test]
@@ -704,67 +809,60 @@ mod tests {
            let layer = LayerDescriptor::from(layer);

            // same skeletan construction; see scenario below
-            let not_found = Arc::new(layer.clone());
-            let new_version = Arc::new(layer);
+            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
+            let new_version: Arc<dyn Layer> = Arc::new(layer);

-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
+            let mut map = LayerMap::default();

-            let mut mapping = LayerFileManager::new();
+            let res = map.batch_update().replace_historic(&not_found, new_version);

-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
+            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
        }

        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = LayerDescriptor::from(name);

-            let remote = Arc::new(skeleton.clone());
-            let downloaded = Arc::new(skeleton);
+            let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
+            let downloaded: Arc<dyn Layer> = Arc::new(skeleton);

            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();

            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
+            assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));

            let expected_in_counts = (1, usize::from(expected_l0));

-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
+            map.batch_update().insert_historic(remote.clone());
+            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);

-            mapping
-                .replace_and_verify(remote, downloaded.clone())
+            let replaced = map
+                .batch_update()
+                .replace_historic(&remote, downloaded.clone())
                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
+            assert!(
+                matches!(replaced, Replacement::Replaced { .. }),
+                "{replaced:?}"
            );
+            assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);

-            map.batch_update()
-                .remove_historic(downloaded.layer_desc().clone());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
+            map.batch_update().remove_historic(downloaded.clone());
+            assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
        }

-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
+        fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
+                .filter(|x| LayerMap::compare_arced_layers(x, layer))
                .count();
            let l0s = map
                .get_level0_deltas()
                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
+            let l0 = l0s
+                .iter()
+                .filter(|x| LayerMap::compare_arced_layers(x, layer))
+                .count();

            (historic, l0)
        }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -3,8 +3,6 @@ use std::ops::Range;

 use tracing::info;

-use crate::tenant::storage_layer::PersistentLayerDesc;
-
 use super::layer_coverage::LayerCoverageTuple;

 /// Layers in this module are identified and indexed by this data.
@@ -43,8 +41,8 @@ impl Ord for LayerKey {
    }
 }

-impl From<&PersistentLayerDesc> for LayerKey {
-    fn from(layer: &PersistentLayerDesc) -> Self {
+impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
+    fn from(layer: &'a L) -> Self {
        let kr = layer.get_key_range();
        let lr = layer.get_lsn_range();
        LayerKey {
@@ -456,6 +454,59 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        self.buffer.insert(layer_key, None);
    }

+    /// Replaces a previous layer with a new layer value.
+    ///
+    /// The replacement is conditional on:
+    /// - there is an existing `LayerKey` record
+    /// - there is no buffered removal for the given `LayerKey`
+    /// - the given closure returns true for the current `Value`
+    ///
+    /// The closure is used to compare the latest value (buffered insert, or existing layer)
+    /// against some expectation. This allows using `Arc::ptr_eq` or similar, which would be
+    /// inaccessible via the `PartialEq` trait.
+    ///
+    /// Returns a `Replacement` value describing the outcome; only the case of
+    /// `Replacement::Replaced` modifies the map and requires a rebuild.
+    pub fn replace<F>(
+        &mut self,
+        layer_key: &LayerKey,
+        new: Value,
+        check_expected: F,
+    ) -> Replacement<Value>
+    where
+        F: FnOnce(&Value) -> bool,
+    {
+        let (slot, in_buffered) = match self.buffer.get(layer_key) {
+            Some(inner @ Some(_)) => {
+                // we compare against the buffered version, because there will be a later
+                // rebuild before querying
+                (inner.as_ref(), true)
+            }
+            Some(None) => {
+                // buffer has removal for this key; it will not be equivalent by any check_expected.
+                return Replacement::RemovalBuffered;
+            }
+            None => {
+                // no pending modification for the key, check layers
+                (self.layers.get(layer_key), false)
+            }
+        };
+
+        match slot {
+            Some(existing) if !check_expected(existing) => {
+                // unfortunate clone here, but otherwise the nll borrowck grows the region of
+                // 'a to cover the whole function, and we could not mutate in the other
+                // Some(existing) branch
+                Replacement::Unexpected(existing.clone())
+            }
+            None => Replacement::NotFound,
+            Some(_existing) => {
+                self.insert(layer_key.to_owned(), new);
+                Replacement::Replaced { in_buffered }
+            }
+        }
+    }
+
    pub fn rebuild(&mut self) {
        // Find the first LSN that needs to be rebuilt
        let rebuild_since: u64 = match self.buffer.iter().next() {
@@ -524,6 +575,22 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    }
 }

+/// Outcome of the replace operation.
+#[derive(Debug)]
+pub enum Replacement<Value> {
+    /// Previous value was replaced with the new value.
+    Replaced {
+        /// Replacement happened for a scheduled insert.
+        in_buffered: bool,
+    },
+    /// Key was not found in buffered updates or existing layers.
+    NotFound,
+    /// Key has been scheduled for removal, it was not replaced.
+    RemovalBuffered,
+    /// Previous value was rejected by the closure.
+    Unexpected(Value),
+}
+
 #[test]
 fn test_retroactive_regression_1() {
    let mut map = BufferedHistoricLayerCoverage::new();
@@ -632,3 +699,139 @@ fn test_retroactive_simple() {
        assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
    }
 }
+
+#[test]
+fn test_retroactive_replacement() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+
+    let keys = [
+        LayerKey {
+            key: 0..5,
+            lsn: 100..101,
+            is_image: true,
+        },
+        LayerKey {
+            key: 3..9,
+            lsn: 110..111,
+            is_image: true,
+        },
+        LayerKey {
+            key: 4..6,
+            lsn: 120..121,
+            is_image: true,
+        },
+    ];
+
+    let layers = [
+        "Image 1".to_string(),
+        "Image 2".to_string(),
+        "Image 3".to_string(),
+    ];
+
+    for (key, layer) in keys.iter().zip(layers.iter()) {
+        map.insert(key.to_owned(), layer.to_owned());
+    }
+
+    // rebuild is not necessary here, because replace works for both buffered updates and existing
+    // layers.
+
+    for (key, orig_layer) in keys.iter().zip(layers.iter()) {
+        let replacement = format!("Remote {orig_layer}");
+
+        // evict
+        let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
+        assert!(
+            matches!(ret, Replacement::Replaced { .. }),
+            "replace {orig_layer}: {ret:?}"
+        );
+        map.rebuild();
+
+        let at = key.lsn.end + 1;
+
+        let version = map.get().expect("rebuilt").get_version(at).unwrap();
+        assert_eq!(
+            version.image_coverage.query(4).as_deref(),
+            Some(replacement.as_str()),
+            "query for 4 at version {at} after eviction",
+        );
+
+        // download
+        let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
+        assert!(
+            matches!(ret, Replacement::Replaced { .. }),
+            "replace {orig_layer} back: {ret:?}"
+        );
+        map.rebuild();
+        let version = map.get().expect("rebuilt").get_version(at).unwrap();
+        assert_eq!(
+            version.image_coverage.query(4).as_deref(),
+            Some(orig_layer.as_str()),
+            "query for 4 at version {at} after download",
+        );
+    }
+}
+
+#[test]
+fn missing_key_is_not_inserted_with_replace() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+    let key = LayerKey {
+        key: 0..5,
+        lsn: 100..101,
+        is_image: true,
+    };
+
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
+    map.rebuild();
+    assert!(map
+        .get()
+        .expect("no changes to rebuild")
+        .get_version(102)
+        .is_none());
+}
+
+#[test]
+fn replacing_buffered_insert_and_remove() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+    let key = LayerKey {
+        key: 0..5,
+        lsn: 100..101,
+        is_image: true,
+    };
+
+    map.insert(key.clone(), "Image 1");
+    let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
+    assert!(
+        matches!(ret, Replacement::Replaced { in_buffered: true }),
+        "{ret:?}"
+    );
+    map.rebuild();
+
+    assert_eq!(
+        map.get()
+            .expect("rebuilt")
+            .get_version(102)
+            .unwrap()
+            .image_coverage
+            .query(4),
+        Some("Remote Image 1")
+    );
+
+    map.remove(key.clone());
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(
+        matches!(ret, Replacement::RemovalBuffered),
+        "cannot replace after scheduled remove: {ret:?}"
+    );
+
+    map.rebuild();
+
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(
+        matches!(ret, Replacement::NotFound),
+        "cannot replace after remove + rebuild: {ret:?}"
+    );
+
+    let at_version = map.get().expect("rebuilt").get_version(102);
+    assert!(at_version.is_none());
+}
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -1,325 +0,0 @@
-//! This module contains the encoding and decoding of the local manifest file.
-//!
-//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
-//! records the state of the storage engine. It contains a snapshot of the
-//! state and all operations proceeding that snapshot. The file begins with a
-//! header recording MANIFEST version number. After that, it contains a snapshot.
-//! The snapshot is followed by a list of operations. Each operation is a list
-//! of records. Each record is either an addition or a removal of a layer.
-//!
-//! With MANIFEST, we can:
-//!
-//! 1. recover state quickly by reading the file, potentially boosting the
-//!    startup speed.
-//! 2. ensure all operations are atomic and avoid corruption, solving issues
-//!    like redundant image layer and preparing us for future compaction
-//!    strategies.
-//!
-//! There is also a format for storing all layer files on S3, called
-//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
-//! records all operations as logs, and therefore we can easily replay the
-//! operations when recovering from crash, while ensuring those operations
-//! are atomic upon restart.
-//!
-//! Currently, this is not used in the system. Future refactors will ensure
-//! the storage state will be recorded in this file, and the system can be
-//! recovered from this file. This is tracked in
-//! https://github.com/neondatabase/neon/issues/4418
-
-use std::io::{self, Read, Write};
-
-use crate::virtual_file::VirtualFile;
-use anyhow::Result;
-use bytes::{Buf, BufMut, Bytes, BytesMut};
-use crc32c::crc32c;
-use serde::{Deserialize, Serialize};
-use tracing::log::warn;
-use utils::lsn::Lsn;
-
-use super::storage_layer::PersistentLayerDesc;
-
-pub struct Manifest {
-    file: VirtualFile,
-}
-
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
-pub struct Snapshot {
-    pub layers: Vec<PersistentLayerDesc>,
-}
-
-/// serde by default encode this in tagged enum, and therefore it will be something
-/// like `{ "AddLayer": { ... } }`.
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
-pub enum Record {
-    AddLayer(PersistentLayerDesc),
-    RemoveLayer(PersistentLayerDesc),
-}
-
-/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
-const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
-const MANIFEST_VERSION: u64 = 1;
-
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
-pub struct ManifestHeader {
-    magic_number: u64,
-    version: u64,
-}
-
-const MANIFEST_HEADER_LEN: usize = 16;
-
-impl ManifestHeader {
-    fn encode(&self) -> BytesMut {
-        let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
-        buf.put_u64(self.magic_number);
-        buf.put_u64(self.version);
-        buf
-    }
-
-    fn decode(mut buf: &[u8]) -> Self {
-        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
-        Self {
-            magic_number: buf.get_u64(),
-            version: buf.get_u64(),
-        }
-    }
-}
-
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
-pub enum Operation {
-    /// A snapshot of the current state.
-    ///
-    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
-    Snapshot(Snapshot, Lsn),
-    /// An atomic operation that changes the state.
-    ///
-    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
-    /// This will only change when new L0 is flushed to the disk.
-    Operation(Vec<Record>, Lsn),
-}
-
-struct RecordHeader {
-    size: u32,
-    checksum: u32,
-}
-
-const RECORD_HEADER_LEN: usize = 8;
-
-impl RecordHeader {
-    fn encode(&self) -> BytesMut {
-        let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
-        buf.put_u32(self.size);
-        buf.put_u32(self.checksum);
-        buf
-    }
-
-    fn decode(mut buf: &[u8]) -> Self {
-        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
-        Self {
-            size: buf.get_u32(),
-            checksum: buf.get_u32(),
-        }
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum ManifestLoadError {
-    #[error("manifest header is corrupted")]
-    CorruptedManifestHeader,
-    #[error("unsupported manifest version: got {0}, expected {1}")]
-    UnsupportedVersion(u64, u64),
-    #[error("error when decoding record: {0}")]
-    DecodeRecord(serde_json::Error),
-    #[error("I/O error: {0}")]
-    Io(io::Error),
-}
-
-#[must_use = "Should check if the manifest is partially corrupted"]
-pub struct ManifestPartiallyCorrupted(bool);
-
-impl Manifest {
-    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
-    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
-        let mut manifest = Self { file };
-        manifest.append_manifest_header(ManifestHeader {
-            magic_number: MANIFEST_MAGIC_NUMBER,
-            version: MANIFEST_VERSION,
-        })?;
-        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
-        Ok(manifest)
-    }
-
-    /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
-    /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
-    /// backup the current one.
-    pub fn load(
-        mut file: VirtualFile,
-    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
-        let mut buf = vec![];
-        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
-
-        // Read manifest header
-        let mut buf = Bytes::from(buf);
-        if buf.remaining() < MANIFEST_HEADER_LEN {
-            return Err(ManifestLoadError::CorruptedManifestHeader);
-        }
-        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
-        buf.advance(MANIFEST_HEADER_LEN);
-        if header.version != MANIFEST_VERSION {
-            return Err(ManifestLoadError::UnsupportedVersion(
-                header.version,
-                MANIFEST_VERSION,
-            ));
-        }
-
-        // Read operations
-        let mut operations = Vec::new();
-        let corrupted = loop {
-            if buf.remaining() == 0 {
-                break false;
-            }
-            if buf.remaining() < RECORD_HEADER_LEN {
-                warn!("incomplete header when decoding manifest, could be corrupted");
-                break true;
-            }
-            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
-            let size = size as usize;
-            buf.advance(RECORD_HEADER_LEN);
-            if buf.remaining() < size {
-                warn!("incomplete data when decoding manifest, could be corrupted");
-                break true;
-            }
-            let data = &buf[..size];
-            if crc32c(data) != checksum {
-                warn!("checksum mismatch when decoding manifest, could be corrupted");
-                break true;
-            }
-            // if the following decode fails, we cannot use the manifest or safely ignore any record.
-            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
-            buf.advance(size);
-        };
-        Ok((
-            Self { file },
-            operations,
-            ManifestPartiallyCorrupted(corrupted),
-        ))
-    }
-
-    fn append_data(&mut self, data: &[u8]) -> Result<()> {
-        if data.len() >= u32::MAX as usize {
-            panic!("data too large");
-        }
-        let header = RecordHeader {
-            size: data.len() as u32,
-            checksum: crc32c(data),
-        };
-        let header = header.encode();
-        self.file.write_all(&header)?;
-        self.file.write_all(data)?;
-        self.file.sync_all()?;
-        Ok(())
-    }
-
-    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
-        let encoded = header.encode();
-        self.file.write_all(&encoded)?;
-        Ok(())
-    }
-
-    /// Add an operation to the manifest. The operation will be appended to the end of the file,
-    /// and the file will fsync.
-    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
-        let encoded = Vec::from(serde_json::to_string(&operation)?);
-        self.append_data(&encoded)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::fs::OpenOptions;
-
-    use crate::repository::Key;
-
-    use super::*;
-
-    #[test]
-    fn test_read_manifest() {
-        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
-        std::fs::create_dir_all(&testdir).unwrap();
-        let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
-        let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
-        let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
-        let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
-        let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
-
-        // Write a manifest with a snapshot and some operations
-        let snapshot = Snapshot {
-            layers: vec![layer1, layer2],
-        };
-        let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
-        manifest
-            .append_operation(Operation::Operation(
-                vec![Record::AddLayer(layer3.clone())],
-                Lsn::from(1),
-            ))
-            .unwrap();
-        drop(manifest);
-
-        // Open the second time and write
-        let file = VirtualFile::open_with_options(
-            &testdir.join("MANIFEST"),
-            OpenOptions::new()
-                .read(true)
-                .write(true)
-                .create_new(false)
-                .truncate(false),
-        )
-        .unwrap();
-        let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
-        assert!(!corrupted.0);
-        assert_eq!(operations.len(), 2);
-        assert_eq!(
-            &operations[0],
-            &Operation::Snapshot(snapshot.clone(), Lsn::from(0))
-        );
-        assert_eq!(
-            &operations[1],
-            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
-        );
-        manifest
-            .append_operation(Operation::Operation(
-                vec![
-                    Record::RemoveLayer(layer3.clone()),
-                    Record::AddLayer(layer4.clone()),
-                ],
-                Lsn::from(2),
-            ))
-            .unwrap();
-        drop(manifest);
-
-        // Open the third time and verify
-        let file = VirtualFile::open_with_options(
-            &testdir.join("MANIFEST"),
-            OpenOptions::new()
-                .read(true)
-                .write(true)
-                .create_new(false)
-                .truncate(false),
-        )
-        .unwrap();
-        let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
-        assert!(!corrupted.0);
-        assert_eq!(operations.len(), 3);
-        assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
-        assert_eq!(
-            &operations[1],
-            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
-        );
-        assert_eq!(
-            &operations[2],
-            &Operation::Operation(
-                vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
-                Lsn::from(2)
-            )
-        );
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,9 +20,12 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
+use crate::tenant::{
+    create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState,
+};
+use crate::IGNORED_TENANT_FILE_NAME;

+use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

@@ -64,7 +67,7 @@ pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
-    init_order: InitializationOrder,
+    init_done: (completion::Completion, completion::Barrier),
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -121,7 +124,7 @@ pub async fn init_tenant_mgr(
                        &tenant_dir_path,
                        broker_client.clone(),
                        remote_storage.clone(),
-                        Some(init_order.clone()),
+                        Some(init_done.clone()),
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -157,7 +160,7 @@ pub fn schedule_local_tenant_processing(
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
-    init_order: Option<InitializationOrder>,
+    init_done: Option<(completion::Completion, completion::Barrier)>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -216,7 +219,7 @@ pub fn schedule_local_tenant_processing(
            tenant_id,
            broker_client,
            remote_storage,
-            init_order,
+            init_done,
            ctx,
        )
    };
@@ -250,28 +253,46 @@ pub async fn shutdown_all_tenants() {
                tenants_clone
            }
            TenantsMap::ShuttingDown(_) => {
-                // TODO: it is possible that detach and shutdown happen at the same time. as a
-                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

+    // Set tenant (and its timelines) to Stopping state.
+    //
+    // Since we can only transition into Stopping state after activation is complete,
+    // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
+    //
+    // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
+    // 1. Lock out any new requests to the tenants.
+    // 2. Signal cancellation to WAL receivers (we wait on it below).
+    // 3. Signal cancellation for other tenant background loops.
+    // 4. ???
+    //
+    // The waiting for the cancellation is not done uniformly.
+    // We certainly wait for WAL receivers to shut down.
+    // That is necessary so that no new data comes in before the freeze_and_flush.
+    // But the tenant background loops are joined-on in our caller.
+    // It's messed up.
    let mut join_set = JoinSet::new();
+    let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                let freeze_and_flush = true;
-
-                match tenant.shutdown(freeze_and_flush).await {
+                match tenant.set_stopping().await {
                    Ok(()) => debug!("tenant successfully stopped"),
-                    Err(super::ShutdownError::AlreadyStopping) => {
-                        warn!("tenant was already shutting down")
+                    Err(SetStoppingError::Broken) => {
+                        info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well");
+                    },
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // our task_mgr::shutdown_tasks are going to coalesce on that just fine
                    }
                }
+
+                tenant
            }
-            .instrument(info_span!("shutdown", %tenant_id)),
+            .instrument(info_span!("set_stopping", %tenant_id)),
        );
    }

@@ -279,7 +300,6 @@ pub async fn shutdown_all_tenants() {

    while let Some(res) = join_set.join_next().await {
        match res {
-            Ok(()) => {}
            Err(join_error) if join_error.is_cancelled() => {
                unreachable!("we are not cancelling any of the futures");
            }
@@ -290,11 +310,50 @@ pub async fn shutdown_all_tenants() {
            Err(join_error) => {
                warn!("unknown kind of JoinError: {join_error}");
            }
+            Ok(tenant) => tenants_to_freeze_and_flush.push(tenant),
        }
    }

    if panicked > 0 {
-        warn!(panicked, "observed panicks while shutting down tenants");
+        warn!(panicked, "observed panicks while stopping tenants");
+    }
+
+    // Shut down all existing walreceiver connections and stop accepting the new ones.
+    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
+
+    // Ok, no background tasks running anymore. Flush any remaining data in
+    // memory to disk.
+    //
+    // We assume that any incoming connections that might request pages from
+    // the tenant have already been terminated by the caller, so there
+    // should be no more activity in any of the repositories.
+    //
+    // On error, log it but continue with the shutdown for other tenants.
+
+    let mut join_set = tokio::task::JoinSet::new();
+
+    for tenant in tenants_to_freeze_and_flush {
+        let tenant_id = tenant.tenant_id();
+
+        join_set.spawn(
+            async move {
+                if let Err(err) = tenant.freeze_and_flush().await {
+                    warn!("Could not checkpoint tenant during shutdown: {err:?}");
+                }
+            }
+            .instrument(info_span!("freeze_and_flush", %tenant_id)),
+        );
+    }
+
+    while let Some(next) = join_set.join_next().await {
+        match next {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("no cancelling")
+            }
+            Err(join_error) if join_error.is_panic() => { /* reported already */ }
+            Err(join_error) => warn!("unknown kind of JoinError: {join_error}"),
+        }
    }
 }

@@ -396,9 +455,7 @@ pub async fn delete_timeline(
    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant
-        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
-        .await?;
+    tenant.delete_timeline(timeline_id, ctx).await?;
    Ok(())
 }

@@ -612,26 +669,35 @@ where
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
-    let tenant = {
-        TENANTS
-            .write()
-            .await
-            .get(&tenant_id)
-            .cloned()
-            .ok_or(TenantStateError::NotFound(tenant_id))?
-    };
-
-    let freeze_and_flush = false;
-
-    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
-    // that we can continue safely to cleanup.
-    match tenant.shutdown(freeze_and_flush).await {
-        Ok(()) => {}
-        Err(super::ShutdownError::AlreadyStopping) => {
-            return Err(TenantStateError::IsStopping(tenant_id))
+    {
+        let tenants_accessor = TENANTS.write().await;
+        match tenants_accessor.get(&tenant_id) {
+            Some(tenant) => {
+                let tenant = Arc::clone(tenant);
+                // don't hold TENANTS lock while set_stopping waits for activation to finish
+                drop(tenants_accessor);
+                match tenant.set_stopping().await {
+                    Ok(()) => {
+                        // we won, continue stopping procedure
+                    }
+                    Err(SetStoppingError::Broken) => {
+                        // continue the procedure, let's hope the closure can deal with broken tenants
+                    }
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // the tenant is already stopping, don't do anything
+                        return Err(TenantStateError::IsStopping(tenant_id));
+                    }
+                }
+            }
+            None => return Err(TenantStateError::NotFound(tenant_id)),
        }
    }

+    // shutdown all tenant and timeline tasks (gc, compaction, page service)
+    // No new tasks will be started for this tenant because it's in `Stopping` state.
+    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
+    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
+
    match tenant_cleanup
        .await
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -675,7 +741,7 @@ pub async fn immediate_gc(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+        .map_err(ApiError::NotFound)?;

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
@@ -724,11 +790,11 @@ pub async fn immediate_compact(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+        .map_err(ApiError::NotFound)?;

    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+        .map_err(ApiError::NotFound)?;

    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,15 +210,13 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;

-use std::collections::{HashMap, VecDeque};
-use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{debug, error, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

@@ -227,9 +225,7 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
-use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -263,7 +259,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
-    Deleted(IndexPart),
+    Deleted,
 }

 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -365,42 +361,11 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Initialize the queue in stopped state. Used in startup path
-    /// to continue deletion operation interrupted by pageserver crash or restart.
-    pub fn init_upload_queue_stopped_to_continue_deletion(
-        &self,
-        index_part: &IndexPart,
-    ) -> anyhow::Result<()> {
-        // FIXME: consider newtype for DeletedIndexPart.
-        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
-            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
-        ))?;
-
-        {
-            let mut upload_queue = self.upload_queue.lock().unwrap();
-            upload_queue.initialize_with_current_remote_index_part(index_part)?;
-            self.update_remote_physical_size_gauge(Some(index_part));
-        }
-        // also locks upload queue, without dropping the guard above it will be a deadlock
-        self.stop().expect("initialized line above");
-
-        let mut upload_queue = self.upload_queue.lock().unwrap();
-
-        upload_queue
-            .stopped_mut()
-            .expect("stopped above")
-            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
-
-        Ok(())
-    }
-
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => {
-                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
-            }
+            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
        }
    }

@@ -455,7 +420,7 @@ impl RemoteTimelineClient {
        .await?;

        if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted(index_part))
+            Ok(MaybeDeletedIndexPart::Deleted)
        } else {
            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
        }
@@ -608,7 +573,10 @@ impl RemoteTimelineClient {
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
+        info!(
+            "scheduled layer file upload {}",
+            layer_file_name.file_name()
+        );

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
@@ -654,14 +622,10 @@ impl RemoteTimelineClient {

            // schedule the actual deletions
            for name in names {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: false,
-                });
+                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+                info!("scheduled layer file deletion {}", name.file_name());
            }

            // Launch the tasks immediately, if possible
@@ -675,11 +639,18 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled uploads/deletions to complete
    ///
    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let mut receiver = {
+        let (sender, mut receiver) = tokio::sync::watch::channel(());
+        let barrier_op = UploadOp::Barrier(sender);
+
+        {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            self.schedule_barrier(upload_queue)
-        };
+            upload_queue.queued_operations.push_back(barrier_op);
+            // Don't count this kind of operation!
+
+            // Launch the task immediately, if possible
+            self.launch_queued_tasks(upload_queue);
+        }

        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -687,22 +658,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    fn schedule_barrier(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-    ) -> tokio::sync::watch::Receiver<()> {
-        let (sender, receiver) = tokio::sync::watch::channel(());
-        let barrier_op = UploadOp::Barrier(sender);
-
-        upload_queue.queued_operations.push_back(barrier_op);
-        // Don't count this kind of operation!
-
-        // Launch the task immediately, if possible
-        self.launch_queued_tasks(upload_queue);
-
-        receiver
-    }
-
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -710,7 +665,6 @@ impl RemoteTimelineClient {
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
-    #[instrument(skip_all)]
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -720,7 +674,15 @@ impl RemoteTimelineClient {
            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
-            let stopped = locked.stopped_mut()?;
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized => {
+                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
+                }
+                UploadQueue::Initialized(_) => {
+                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };

            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -734,34 +696,48 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
-                .context("IndexPart serialize")?;
+            let mut index_part = IndexPart::new(
+                stopped.latest_files.clone(),
+                stopped.last_uploaded_consistent_lsn,
+                stopped
+                    .latest_metadata
+                    .to_bytes()
+                    .context("serialize metadata")?,
+            );
            index_part.deleted_at = Some(deleted_at);
            index_part
        };

        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = locked
-                .stopped_mut()
-                .expect("there's no way out of Stopping, and we checked it's Stopping above");
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });

        // Have a failpoint that can use the `pause` failpoint action.
        // We don't want to block the executor thread, hence, spawn_blocking + await.
-        if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
-                move || {
-                    let _entered = current.entered();
-                    tracing::info!("at failpoint persist_deleted_index_part");
-                    fail::fail_point!("persist_deleted_index_part");
-                }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
+        #[cfg(feature = "testing")]
+        tokio::task::spawn_blocking({
+            let current = tracing::Span::current();
+            move || {
+                let _entered = current.entered();
+                tracing::info!(
+                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+                fail::fail_point!(
+                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+            }
+        })
+        .await
+        .expect("spawn_blocking");
+
        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
@@ -775,10 +751,13 @@ impl RemoteTimelineClient {
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
-
-            let stopped = locked
-                .stopped_mut()
-                .expect("there's no way out of Stopping, and we checked it's Stopping above");
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
@@ -789,90 +768,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
-    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
-    /// deletes leaked files if any and proceeds with deletion of index file at the end.
-    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
-        let (mut receiver, deletions_queued) = {
-            let mut deletions_queued = 0;
-
-            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = locked.stopped_mut()?;
-
-            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
-                anyhow::bail!("deleted_at is not set")
-            }
-
-            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
-
-            stopped
-                .upload_queue_for_deletion
-                .queued_operations
-                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
-
-            // schedule the actual deletions
-            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: true,
-                });
-                self.calls_unfinished_metric_begin(&op);
-                stopped
-                    .upload_queue_for_deletion
-                    .queued_operations
-                    .push_back(op);
-
-                info!("scheduled layer file deletion {name}");
-                deletions_queued += 1;
-            }
-
-            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
-
-            (
-                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
-                deletions_queued,
-            )
-        };
-
-        receiver.changed().await?;
-
-        // Do not delete index part yet, it is needed for possible retry. If we remove it first
-        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
-        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
-
-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
-
-        let remaining: Vec<RemotePath> = remaining
-            .into_iter()
-            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
-            .collect();
-
-        if !remaining.is_empty() {
-            warn!(
-                "Found {} files not bound to index_file.json, proceeding with their deletion",
-                remaining.len()
-            );
-            warn!("About to remove {} files", remaining.len());
-            self.storage_impl.delete_objects(&remaining).await?;
-        }
-
-        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
-
-        debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
-
-        info!(deletions_queued, "done deleting, including index_part.json");
-
-        Ok(())
-    }
-
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -891,7 +786,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(_, _) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -922,7 +817,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(_, _) => {
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -996,6 +891,7 @@ impl RemoteTimelineClient {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
+                self.calls_unfinished_metric_end(&task.op);
                return;
            }

@@ -1041,16 +937,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(delete) => {
+                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
                    let path = &self
                        .conf
                        .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(delete.layer_file_name.file_name());
+                        .join(layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
-                            delete.file_kind,
+                            *metric_file_kind,
                            RemoteOpKind::Delete,
                            Arc::clone(&self.metrics),
                        )
@@ -1116,24 +1012,11 @@ impl RemoteTimelineClient {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(stopped) => {
-                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
-                    // then stop() took care of it so we just return.
-                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
-                    match &task.op {
-                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
-                        _ => None
-                    }
-                },
-                UploadQueue::Initialized(qi) => { Some(qi) }
-            };
-
-            let upload_queue = match upload_queue {
-                Some(upload_queue) => upload_queue,
-                None => {
+                UploadQueue::Stopped(_) => {
                    info!("another concurrent task already stopped the queue");
                    return;
-                }
+                }, // nothing to do
+                UploadQueue::Initialized(qi) => { qi }
            };

            upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1146,7 +1029,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(_, _) => {
                    upload_queue.num_inprogress_deletions -= 1;
                }
                UploadOp::Barrier(_) => unreachable!(),
@@ -1180,8 +1063,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(delete) => (
-                delete.file_kind,
+            UploadOp::Delete(file_kind, _) => (
+                *file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1228,36 +1111,32 @@ impl RemoteTimelineClient {
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
-            UploadQueue::Initialized(initialized) => {
+            UploadQueue::Initialized(UploadQueueInitialized {
+                latest_files,
+                latest_metadata,
+                last_uploaded_consistent_lsn,
+                ..
+            }) => {
                info!("shutting down upload queue");

                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    // Here we preserve working version of the upload queue for possible use during deletions.
-                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
-                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
-                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
-                    let upload_queue_for_deletion = UploadQueueInitialized {
-                        task_counter: 0,
-                        latest_files: initialized.latest_files.clone(),
-                        latest_files_changes_since_metadata_upload_scheduled: 0,
-                        latest_metadata: initialized.latest_metadata.clone(),
-                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
-                        num_inprogress_layer_uploads: 0,
-                        num_inprogress_metadata_uploads: 0,
-                        num_inprogress_deletions: 0,
-                        inprogress_tasks: HashMap::default(),
-                        queued_operations: VecDeque::default(),
+                    // take or clone what we need
+                    let latest_files = std::mem::take(latest_files);
+                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // cloned because the metadata type is not Copy (though it could be)
+                    let latest_metadata = latest_metadata.clone();
+
+                    let stopped = UploadQueueStopped {
+                        latest_files,
+                        last_uploaded_consistent_lsn,
+                        latest_metadata,
+                        deleted_at: SetDeletedFlagProgress::NotRunning,
                    };

-                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            upload_queue_for_deletion,
-                            deleted_at: SetDeletedFlagProgress::NotRunning,
-                        }),
-                    );
+                    let upload_queue =
+                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1265,6 +1144,8 @@ impl RemoteTimelineClient {
                    }
                };

+                assert!(qi.latest_files.is_empty(), "do not use this anymore");
+
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1362,7 +1243,7 @@ mod tests {
    struct TestSetup {
        runtime: &'static tokio::runtime::Runtime,
        entered_runtime: EnterGuard<'static>,
-        harness: TenantHarness,
+        harness: TenantHarness<'static>,
        tenant: Arc<Tenant>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
@@ -1383,12 +1264,7 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let _ = runtime.block_on(tenant.create_test_timeline(
-                TIMELINE_ID,
-                Lsn(8),
-                DEFAULT_PG_VERSION,
-                &ctx,
-            ))?;
+            let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1532,7 +1408,7 @@ mod tests {
        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,7 +16,7 @@ use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,11 +7,9 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::upload_queue::UploadQueueInitialized;

 use utils::lsn::Lsn;

@@ -117,21 +115,6 @@ impl IndexPart {
    }
 }

-impl TryFrom<&UploadQueueInitialized> for IndexPart {
-    type Error = SerializeError;
-
-    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-
-        Ok(Self::new(
-            upload_queue.latest_files.clone(),
-            disk_consistent_lsn,
-            metadata_bytes,
-        ))
-    }
-}
-
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -1,20 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {}
-
-#[cfg(debug_assertions)]
-pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -38,7 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
-pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
+pub use layer_desc::PersistentLayerDesc;
 pub use remote_layer::RemoteLayer;

 use super::layer_map::BatchedUpdates;
@@ -176,10 +176,13 @@ impl LayerAccessStats {
    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn for_loading_layer(
-        layer_map_lock_held_witness: &BatchedUpdates<'_>,
+    pub(crate) fn for_loading_layer<L>(
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        status: LayerResidenceStatus,
-    ) -> Self {
+    ) -> Self
+    where
+        L: ?Sized + Layer,
+    {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
        new.record_residence_event(
            layer_map_lock_held_witness,
@@ -194,11 +197,14 @@ impl LayerAccessStats {
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn clone_for_residence_change(
+    pub(crate) fn clone_for_residence_change<L>(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
+    ) -> LayerAccessStats
+    where
+        L: ?Sized + Layer,
+    {
        let clone = {
            let inner = self.0.lock().unwrap();
            inner.clone()
@@ -226,12 +232,14 @@ impl LayerAccessStats {
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
    ///
-    pub(crate) fn record_residence_event(
+    pub(crate) fn record_residence_event<L>(
        &self,
-        _layer_map_lock_held_witness: &BatchedUpdates<'_>,
+        _layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
-    ) {
+    ) where
+        L: ?Sized + Layer,
+    {
        let mut locked = self.0.lock().unwrap();
        locked.iter_mut().for_each(|inner| {
            inner
@@ -335,7 +343,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
+pub trait Layer: std::fmt::Debug + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -373,15 +381,18 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
        ctx: &RequestContext,
    ) -> Result<ValueReconstructResult>;

+    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
+    fn short_id(&self) -> String;
+
    /// Dump summary of the contents of the layer to stdout
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

 /// Returned by [`Layer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;

 /// Returned by [`Layer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;

 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
@@ -443,9 +454,7 @@ pub trait PersistentLayer: Layer {
    ///
    /// Should not change over the lifetime of the layer object because
    /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> u64 {
-        self.layer_desc().file_size
-    }
+    fn file_size(&self) -> u64;

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

@@ -462,127 +471,80 @@ pub fn downcast_remote_layer(
    }
 }

-pub mod tests {
-    use super::*;
+/// Holds metadata about a layer without any content. Used mostly for testing.
+///
+/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
+/// LayerDescriptor.
+#[derive(Clone, Debug)]
+pub struct LayerDescriptor {
+    pub key: Range<Key>,
+    pub lsn: Range<Lsn>,
+    pub is_incremental: bool,
+    pub short_id: String,
+}

-    /// Holds metadata about a layer without any content. Used mostly for testing.
-    ///
-    /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
-    /// LayerDescriptor.
-    #[derive(Clone, Debug)]
-    pub struct LayerDescriptor {
-        base: PersistentLayerDesc,
+impl Layer for LayerDescriptor {
+    fn get_key_range(&self) -> Range<Key> {
+        self.key.clone()
    }

-    impl From<PersistentLayerDesc> for LayerDescriptor {
-        fn from(base: PersistentLayerDesc) -> Self {
-            Self { base }
-        }
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn.clone()
    }

-    impl Layer for LayerDescriptor {
-        fn get_value_reconstruct_data(
-            &self,
-            _key: Key,
-            _lsn_range: Range<Lsn>,
-            _reconstruct_data: &mut ValueReconstructState,
-            _ctx: &RequestContext,
-        ) -> Result<ValueReconstructResult> {
-            todo!("This method shouldn't be part of the Layer trait")
-        }
-
-        fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-            todo!()
-        }
-
-        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-        fn get_key_range(&self) -> Range<Key> {
-            self.layer_desc().key_range.clone()
-        }
-
-        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-        fn get_lsn_range(&self) -> Range<Lsn> {
-            self.layer_desc().lsn_range.clone()
-        }
-
-        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-        fn is_incremental(&self) -> bool {
-            self.layer_desc().is_incremental
-        }
+    fn is_incremental(&self) -> bool {
+        self.is_incremental
    }

-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    impl std::fmt::Display for LayerDescriptor {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}", self.layer_desc().short_id())
-        }
+    fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_data: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        todo!("This method shouldn't be part of the Layer trait")
    }

-    impl PersistentLayer for LayerDescriptor {
-        fn layer_desc(&self) -> &PersistentLayerDesc {
-            &self.base
-        }
-
-        fn local_path(&self) -> Option<PathBuf> {
-            unimplemented!()
-        }
-
-        fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
-            unimplemented!()
-        }
-
-        fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
-            unimplemented!()
-        }
-
-        fn delete_resident_layer_file(&self) -> Result<()> {
-            unimplemented!()
-        }
-
-        fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
-            unimplemented!()
-        }
-
-        fn access_stats(&self) -> &LayerAccessStats {
-            unimplemented!()
-        }
+    fn short_id(&self) -> String {
+        self.short_id.clone()
    }

-    impl From<DeltaFileName> for LayerDescriptor {
-        fn from(value: DeltaFileName) -> Self {
-            LayerDescriptor {
-                base: PersistentLayerDesc::new_delta(
-                    TenantId::from_array([0; 16]),
-                    TimelineId::from_array([0; 16]),
-                    value.key_range,
-                    value.lsn_range,
-                    233,
-                ),
-            }
+    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+        todo!()
+    }
+}
+
+impl From<DeltaFileName> for LayerDescriptor {
+    fn from(value: DeltaFileName) -> Self {
+        let short_id = value.to_string();
+        LayerDescriptor {
+            key: value.key_range,
+            lsn: value.lsn_range,
+            is_incremental: true,
+            short_id,
        }
    }
+}

-    impl From<ImageFileName> for LayerDescriptor {
-        fn from(value: ImageFileName) -> Self {
-            LayerDescriptor {
-                base: PersistentLayerDesc::new_img(
-                    TenantId::from_array([0; 16]),
-                    TimelineId::from_array([0; 16]),
-                    value.key_range,
-                    value.lsn,
-                    false,
-                    233,
-                ),
-            }
+impl From<ImageFileName> for LayerDescriptor {
+    fn from(value: ImageFileName) -> Self {
+        let short_id = value.to_string();
+        let lsn = value.lsn_as_range();
+        LayerDescriptor {
+            key: value.key_range,
+            lsn,
+            is_incremental: false,
+            short_id,
        }
    }
+}

-    impl From<LayerFileName> for LayerDescriptor {
-        fn from(value: LayerFileName) -> Self {
-            match value {
-                LayerFileName::Delta(d) => Self::from(d),
-                LayerFileName::Image(i) => Self::from(i),
-            }
+impl From<LayerFileName> for LayerDescriptor {
+    fn from(value: LayerFileName) -> Self {
+        match value {
+            LayerFileName::Delta(d) => Self::from(d),
+            LayerFileName::Image(i) => Self::from(i),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -37,7 +37,6 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
-use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -47,6 +46,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
+use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;

 use utils::{
@@ -182,9 +182,11 @@ pub struct DeltaLayer {

    pub desc: PersistentLayerDesc,

+    pub file_size: u64,
+
    access_stats: LayerAccessStats,

-    inner: OnceCell<DeltaLayerInner>,
+    inner: RwLock<DeltaLayerInner>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -194,24 +196,28 @@ impl std::fmt::Debug for DeltaLayer {
        f.debug_struct("DeltaLayer")
            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
            .field("lsn_range", &self.desc.lsn_range)
-            .field("file_size", &self.desc.file_size)
+            .field("file_size", &self.file_size)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct DeltaLayerInner {
+    /// If false, the fields below have not been loaded into memory yet.
+    loaded: bool,
+
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader<VirtualFile>,
+    /// Reader object for reading blocks from the file. (None if not loaded yet)
+    file: Option<FileBlockReader<VirtualFile>>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
+            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -222,14 +228,13 @@ impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        if !verbose {
@@ -243,7 +248,7 @@ impl Layer for DeltaLayer {
            inner.index_start_blk, inner.index_root_blk
        );

-        let file = &inner.file;
+        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -312,7 +317,7 @@ impl Layer for DeltaLayer {
            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

            // Scan the page versions backwards, starting from `lsn`.
-            let file = &inner.file;
+            let file = inner.file.as_ref().unwrap();
            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
@@ -395,11 +400,10 @@ impl Layer for DeltaLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for DeltaLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

@@ -435,6 +439,10 @@ impl PersistentLayer for DeltaLayer {
        Ok(())
    }

+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
@@ -443,7 +451,7 @@ impl PersistentLayer for DeltaLayer {

        HistoricLayerInfo::Delta {
            layer_file_name,
-            layer_file_size: self.desc.file_size,
+            layer_file_size: self.file_size,
            lsn_start: lsn_range.start,
            lsn_end: lsn_range.end,
            remote: false,
@@ -498,22 +506,51 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
+    fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<RwLockReadGuard<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        // Quick exit if already loaded
-        self.inner
-            .get_or_try_init(|| self.load_inner())
-            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
+        loop {
+            // Quick exit if already loaded
+            let inner = self.inner.read().unwrap();
+            if inner.loaded {
+                return Ok(inner);
+            }
+
+            // Need to open the file and load the metadata. Upgrade our lock to
+            // a write lock. (Or rather, release and re-lock in write mode.)
+            drop(inner);
+            let inner = self.inner.write().unwrap();
+            if !inner.loaded {
+                self.load_inner(inner).with_context(|| {
+                    format!("Failed to load delta layer {}", self.path().display())
+                })?;
+            } else {
+                // Another thread loaded it while we were not holding the lock.
+            }
+
+            // We now have the file open and loaded. The std library RwLock has
+            // no function to downgrade a write lock to a read lock, so we have
+            // to release and re-lock in read mode. (To be precise, the guard was
+            // either moved into `load_inner` above or is dropped when this
+            // iteration ends, so it is released either way.) Meanwhile, another
+            // thread could unload again, so we loop to re-check and retry.
+        }
    }

-    fn load_inner(&self) -> Result<DeltaLayerInner> {
+    fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
        let path = self.path();

-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-
+        // Open the file if it's not open already.
+        if inner.file.is_none() {
+            let file = VirtualFile::open(&path)
+                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            inner.file = Some(FileBlockReader::new(file));
+        }
+        let file = inner.file.as_mut().unwrap();
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -540,13 +577,13 @@ impl DeltaLayer {
            }
        }

+        inner.index_start_blk = actual_summary.index_start_blk;
+        inner.index_root_blk = actual_summary.index_root_blk;
+
        debug!("loaded from {}", &path.display());

-        Ok(DeltaLayerInner {
-            file,
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-        })
+        inner.loaded = true;
+        Ok(())
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -565,10 +602,15 @@ impl DeltaLayer {
                timeline_id,
                filename.key_range.clone(),
                filename.lsn_range.clone(),
-                file_size,
            ),
+            file_size,
            access_stats,
-            inner: once_cell::sync::OnceCell::new(),
+            inner: RwLock::new(DeltaLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        }
    }

@@ -592,10 +634,15 @@ impl DeltaLayer {
                summary.timeline_id,
                summary.key_range,
                summary.lsn_range,
-                metadata.len(),
            ),
+            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: once_cell::sync::OnceCell::new(),
+            inner: RwLock::new(DeltaLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        })
    }

@@ -756,10 +803,15 @@ impl DeltaLayerWriterInner {
                self.timeline_id,
                self.key_start..key_end,
                self.lsn_range.clone(),
-                metadata.len(),
            ),
+            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: once_cell::sync::OnceCell::new(),
+            inner: RwLock::new(DeltaLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk,
+                index_root_blk,
+            }),
        };

        // fsync the file
@@ -894,13 +946,13 @@ struct DeltaValueIter<'a> {
    reader: BlockCursor<Adapter<'a>>,
 }

-struct Adapter<'a>(&'a DeltaLayerInner);
+struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);

 impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;

    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.read_blk(blknum)
+        self.0.file.as_ref().unwrap().read_blk(blknum)
    }
 }

@@ -913,8 +965,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
 }

 impl<'a> DeltaValueIter<'a> {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -987,8 +1039,8 @@ impl Iterator for DeltaKeyIter {
 }

 impl<'a> DeltaKeyIter {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1028,21 +1080,3 @@ impl<'a> DeltaKeyIter {
        Ok(iter)
    }
 }
-
-#[cfg(test)]
-mod test {
-    use super::DeltaKeyIter;
-    use super::DeltaLayer;
-    use super::DeltaValueIter;
-
-    // We will soon need the iters to be send in the compaction code.
-    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
-    // Cf https://github.com/neondatabase/neon/issues/4471
-    #[test]
-    fn is_send() {
-        fn assert_send<T: Send>() {}
-        assert_send::<DeltaLayer>();
-        assert_send::<DeltaValueIter>();
-        assert_send::<DeltaKeyIter>();
-    }
-}
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -210,15 +210,9 @@ pub enum LayerFileName {

 impl LayerFileName {
    pub fn file_name(&self) -> String {
-        self.to_string()
-    }
-}
-
-impl fmt::Display for LayerFileName {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
-            Self::Image(fname) => write!(f, "{fname}"),
-            Self::Delta(fname) => write!(f, "{fname}"),
+            Self::Image(fname) => fname.to_string(),
+            Self::Delta(fname) => fname.to_string(),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -109,6 +109,8 @@ pub struct ImageLayer {
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,

+    pub file_size: u64,
+
    access_stats: LayerAccessStats,

    inner: RwLock<ImageLayerInner>,
@@ -120,7 +122,7 @@ impl std::fmt::Debug for ImageLayer {

        f.debug_struct("ImageLayer")
            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
-            .field("file_size", &self.desc.file_size)
+            .field("file_size", &self.file_size)
            .field("lsn", &self.lsn)
            .field("inner", &self.inner)
            .finish()
@@ -153,14 +155,12 @@ impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+            "----- image layer for ten {} tli {} key {}-{} at {} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental,
-            self.desc.file_size
+            self.lsn
        );

        if !verbose {
@@ -232,12 +232,10 @@ impl Layer for ImageLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

@@ -260,13 +258,17 @@ impl PersistentLayer for ImageLayer {
        Ok(())
    }

+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();

        HistoricLayerInfo::Image {
            layer_file_name,
-            layer_file_size: self.desc.file_size,
+            layer_file_size: self.file_size,
            lsn_start: lsn_range.start,
            remote: false,
            access_stats: self.access_stats.as_api_model(reset),
@@ -409,9 +411,9 @@ impl ImageLayer {
                filename.key_range.clone(),
                filename.lsn,
                false,
-                file_size,
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
+            file_size,
            access_stats,
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
@@ -441,9 +443,9 @@ impl ImageLayer {
                summary.key_range,
                summary.lsn,
                false,
-                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
+            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                file: None,
@@ -576,6 +578,14 @@ impl ImageLayerWriterInner {
            file.write_all(buf.as_ref())?;
        }

+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            self.is_incremental, // for now, image layer ALWAYS covers the full range
+        );
+
        // Fill in the summary on blk 0
        let summary = Summary {
            magic: IMAGE_FILE_MAGIC,
@@ -594,15 +604,6 @@ impl ImageLayerWriterInner {
            .metadata()
            .context("get metadata to determine file size")?;

-        let desc = PersistentLayerDesc::new_img(
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.clone(),
-            self.lsn,
-            self.is_incremental, // for now, image layer ALWAYS covers the full range
-            metadata.len(),
-        );
-
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -610,6 +611,7 @@ impl ImageLayerWriterInner {
            path_or_conf: PathOrConf::Conf(self.conf),
            desc,
            lsn: self.lsn,
+            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -131,6 +131,13 @@ impl Layer for InMemoryLayer {
        true
    }

+    fn short_id(&self) -> String {
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+    }
+
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();
@@ -233,15 +240,6 @@ impl Layer for InMemoryLayer {
    }
 }

-impl std::fmt::Display for InMemoryLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
-        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
-    }
-}
-
 impl InMemoryLayer {
    ///
    /// Get layer size on the disk
@@ -306,7 +304,7 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys

        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,21 +1,17 @@
-use anyhow::Result;
-use core::fmt::Display;
 use std::ops::Range;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

-use serde::{Deserialize, Serialize};
-
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
@@ -28,42 +24,11 @@ pub struct PersistentLayerDesc {
    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
    /// incremental.
    pub is_incremental: bool,
-    /// File size
-    pub file_size: u64,
-}
-
-/// A unique identifier of a persistent layer within the context of one timeline.
-#[derive(Debug, PartialEq, Eq, Clone, Hash)]
-pub struct PersistentLayerKey {
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
-    pub is_delta: bool,
 }

 impl PersistentLayerDesc {
-    pub fn key(&self) -> PersistentLayerKey {
-        PersistentLayerKey {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-            is_delta: self.is_delta,
-        }
-    }
-
-    pub fn short_id(&self) -> impl Display {
-        self.filename()
-    }
-
-    #[cfg(test)]
-    pub fn new_test(key_range: Range<Key>) -> Self {
-        Self {
-            tenant_id: TenantId::generate(),
-            timeline_id: TimelineId::generate(),
-            key_range,
-            lsn_range: Lsn(0)..Lsn(1),
-            is_delta: false,
-            is_incremental: false,
-            file_size: 0,
-        }
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
    }

    pub fn new_img(
@@ -72,7 +37,6 @@ impl PersistentLayerDesc {
        key_range: Range<Key>,
        lsn: Lsn,
        is_incremental: bool,
-        file_size: u64,
    ) -> Self {
        Self {
            tenant_id,
@@ -81,7 +45,6 @@ impl PersistentLayerDesc {
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
            is_incremental,
-            file_size,
        }
    }

@@ -90,7 +53,6 @@ impl PersistentLayerDesc {
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
-        file_size: u64,
    ) -> Self {
        Self {
            tenant_id,
@@ -99,7 +61,6 @@ impl PersistentLayerDesc {
            lsn_range,
            is_delta: true,
            is_incremental: true,
-            file_size,
        }
    }

@@ -145,51 +106,4 @@ impl PersistentLayerDesc {
            self.image_file_name().into()
        }
    }
-
-    // TODO: remove this in the future once we refactor timeline APIs.
-
-    pub fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-
-    pub fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    pub fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
-
-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    pub fn is_incremental(&self) -> bool {
-        self.is_incremental
-    }
-
-    pub fn is_delta(&self) -> bool {
-        self.is_delta
-    }
-
-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental,
-            self.file_size,
-        );
-
-        Ok(())
-    }
-
-    pub fn file_size(&self) -> u64 {
-        self.file_size
-    }
 }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -71,22 +71,22 @@ impl Layer for RemoteLayer {
        _reconstruct_state: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
-        bail!("layer {self} needs to be downloaded");
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
    }

    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.is_delta,
-            self.desc.is_incremental,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        Ok(())
@@ -106,12 +106,10 @@ impl Layer for RemoteLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

@@ -144,6 +142,10 @@ impl PersistentLayer for RemoteLayer {
        true
    }

+    fn file_size(&self) -> u64 {
+        self.layer_metadata.file_size()
+    }
+
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
@@ -188,7 +190,6 @@ impl RemoteLayer {
                fname.key_range.clone(),
                fname.lsn,
                false,
-                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
@@ -210,7 +211,6 @@ impl RemoteLayer {
                timelineid,
                fname.key_range.clone(),
                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
@@ -220,12 +220,15 @@ impl RemoteLayer {
    }

    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer(
+    pub fn create_downloaded_layer<L>(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        conf: &'static PageServerConf,
        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
+    ) -> Arc<dyn PersistentLayer>
+    where
+        L: ?Sized + Layer,
+    {
        if self.desc.is_delta {
            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,11 +14,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;

-/// Start per tenant background loops: compaction and gc.
-pub fn start_background_loops(
-    tenant: &Arc<Tenant>,
-    background_jobs_can_start: Option<&completion::Barrier>,
-) {
+pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
    let tenant_id = tenant.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
@@ -29,14 +25,10 @@ pub fn start_background_loops(
        false,
        {
            let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let init_done = init_done.cloned();
            async move {
-                let cancel = task_mgr::shutdown_token();
-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
-                };
-                compaction_loop(tenant, cancel)
+                completion::Barrier::maybe_wait(init_done).await;
+                compaction_loop(tenant)
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
@@ -52,14 +44,10 @@ pub fn start_background_loops(
        false,
        {
            let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let init_done = init_done.cloned();
            async move {
-                let cancel = task_mgr::shutdown_token();
-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
-                };
-                gc_loop(tenant, cancel)
+                completion::Barrier::maybe_wait(init_done).await;
+                gc_loop(tenant)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
@@ -71,11 +59,12 @@ pub fn start_background_loops(
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn compaction_loop(tenant: Arc<Tenant>) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
+        let cancel = task_mgr::shutdown_token();
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
@@ -140,11 +129,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn gc_loop(tenant: Arc<Tenant>) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
+        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
        let ctx =
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -34,8 +34,6 @@ use crate::{
    },
 };

-use utils::completion;
-
 use super::Timeline;

 #[derive(Default)]
@@ -49,12 +47,8 @@ pub struct EvictionTaskTenantState {
 }

 impl Timeline {
-    pub(super) fn launch_eviction_task(
-        self: &Arc<Self>,
-        background_tasks_can_start: Option<&completion::Barrier>,
-    ) {
+    pub(super) fn launch_eviction_task(self: &Arc<Self>) {
        let self_clone = Arc::clone(self);
-        let background_tasks_can_start = background_tasks_can_start.cloned();
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
@@ -63,13 +57,8 @@ impl Timeline {
            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
            false,
            async move {
-                let cancel = task_mgr::shutdown_token();
-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); }
-                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
-                };
-
-                self_clone.eviction_task(cancel).await;
+                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                info!("eviction task finishing");
                Ok(())
            },
        );
@@ -77,9 +66,6 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
-        scopeguard::defer! {
-            info!("eviction task finishing");
-        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
@@ -88,6 +74,7 @@ impl Timeline {
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
            if random_init_delay(period, &cancel).await.is_err() {
+                info!("shutting down");
                return;
            }
        }
@@ -102,6 +89,7 @@ impl Timeline {
                ControlFlow::Continue(sleep_until) => {
                    tokio::select! {
                        _ = cancel.cancelled() => {
+                            info!("shutting down");
                            break;
                        }
                        _ = tokio::time::sleep_until(sleep_until) => { }
@@ -197,11 +185,9 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let guard = self.layers.read().await;
-            let (layers, mapping) = &*guard;
+            let layers = self.layers.read().unwrap();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
-                let hist_layer = mapping.get_from_desc(&hist_layer);
                if hist_layer.is_remote_layer() {
                    continue;
                }
@@ -209,7 +195,7 @@ impl Timeline {
                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
+                    debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
                    SystemTime::now()
                });

--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,128 +0,0 @@
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-
-use tokio::sync::Semaphore;
-use utils::lsn::Lsn;
-
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
-
-/// Internal structure to hold all data needed for logical size calculation.
-///
-/// Calculation consists of two stages:
-///
-/// 1. Initial size calculation. That might take a long time, because it requires
-/// reading all layers containing relation sizes at `initial_part_end`.
-///
-/// 2. Collecting an incremental part and adding that to the initial size.
-/// Increments are appended on walreceiver writing new timeline data,
-/// which result in increase or decrease of the logical size.
-pub(super) struct LogicalSize {
-    /// Size, potentially slow to compute. Calculating this might require reading multiple
-    /// layers, and even ancestor's layers.
-    ///
-    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
-    /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
-
-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
-
-    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
-    pub initial_part_end: Option<Lsn>,
-
-    /// All other size changes after startup, combined together.
-    ///
-    /// Size shouldn't ever be negative, but this is signed for two reasons:
-    ///
-    /// 1. If we initialized the "baseline" size lazily, while we already
-    /// process incoming WAL, the incoming WAL records could decrement the
-    /// variable and temporarily make it negative. (This is just future-proofing;
-    /// the initialization is currently not done lazily.)
-    ///
-    /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    /// when size grows, but remember to decrement it when it shrinks again, the
-    /// variable could go negative. In that case, it seems better to at least
-    /// try to keep tracking it, rather than clamp or overflow it. Note that
-    /// get_current_logical_size() will clamp the returned value to zero if it's
-    /// negative, and log an error. Could set it permanently to zero or some
-    /// special value to indicate "broken" instead, but this will do for now.
-    ///
-    /// Note that we also expose a copy of this value as a prometheus metric,
-    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
-    /// to modify this, it will also keep the prometheus metric in sync.
-    pub size_added_after_initial: AtomicI64,
-}
-
-/// Normalized current size, that the data in pageserver occupies.
-#[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
-    /// The size is not yet calculated to the end, this is an intermediate result,
-    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
-    /// yet total logical size cannot be below 0.
-    Approximate(u64),
-    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
-    // available for observation without any calculations.
-    Exact(u64),
-}
-
-impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
-        }
-    }
-}
-
-impl LogicalSize {
-    pub(super) fn empty_initial() -> Self {
-        Self {
-            initial_logical_size: OnceCell::with_value(0),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
-            initial_part_end: None,
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
-        Self {
-            initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
-            initial_part_end: Some(compute_to),
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
-        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
-        //                  ^^^ keep this type explicit so that the casts in this function break if
-        //                  we change the type.
-        match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
-                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
-            }
-            None => {
-                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
-            }
-        }
-    }
-
-    pub(super) fn increment_size(&self, delta: i64) {
-        self.size_added_after_initial
-            .fetch_add(delta, AtomicOrdering::SeqCst);
-    }
-
-    /// Make the value computed by initial logical size computation
-    /// available for re-use. This doesn't contain the incremental part.
-    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
-        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
-            _ => None,
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,25 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-        once_cell::sync::Lazy::new(|| {
-            MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
-        });
-
-    let fields: [&dyn Extractor; 2] = [
-        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ];
-    if let Err(missing) = check_fields_present(fields) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,219 +0,0 @@
-use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
-
-use anyhow::Context;
-use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, id::TimelineId, lsn::Lsn};
-
-use crate::{
-    context::RequestContext,
-    import_datadir,
-    tenant::{ignore_absent_files, Tenant},
-};
-
-use super::Timeline;
-
-/// A timeline with some of its files on disk, being initialized.
-/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
-///
-/// The caller is responsible for proper timeline data filling before the final init.
-#[must_use]
-pub struct UninitializedTimeline<'t> {
-    pub(crate) owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-}
-
-impl<'t> UninitializedTimeline<'t> {
-    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
-        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-    ) -> Self {
-        Self {
-            owning_tenant,
-            timeline_id,
-            raw_timeline,
-        }
-    }
-
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
-    ///
-    /// This function launches the flush loop if not already done.
-    ///
-    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
-
-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
-
-        // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
-        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
-
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
-            ),
-            Entry::Vacant(v) => {
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
-                v.insert(Arc::clone(&new_timeline));
-
-                new_timeline.maybe_spawn_flush_loop();
-            }
-        }
-
-        Ok(new_timeline)
-    }
-
-    /// Prepares timeline data by loading it from the basebackup archive.
-    pub(crate) async fn import_basebackup_from_tar(
-        self,
-        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
-        base_lsn: Lsn,
-        broker_client: storage_broker::BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
-
-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
-
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
-        let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
-        Ok(tl)
-    }
-
-    pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
-        Ok(&self
-            .raw_timeline
-            .as_ref()
-            .with_context(|| {
-                format!(
-                    "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?
-            .0)
-    }
-}
-
-impl Drop for UninitializedTimeline<'_> {
-    fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
-            error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
-        }
-    }
-}
-
-pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
-    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
-        Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
-        }
-        Err(e) => {
-            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
-        }
-    }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
-}
-
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
-#[must_use]
-pub(crate) struct TimelineUninitMark {
-    uninit_mark_deleted: bool,
-    uninit_mark_path: PathBuf,
-    pub(crate) timeline_path: PathBuf,
-}
-
-impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
-        }
-    }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
-}
-
-impl Drop for TimelineUninitMark {
-    fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path.display(),
-                    self.timeline_path.display()
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path.display()
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -25,7 +25,6 @@ mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
-use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };
@@ -86,8 +85,7 @@ impl WalReceiver {
            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
            false,
            async move {
-                debug_assert_current_span_has_tenant_and_timeline_id();
-                debug!("WAL receiver manager started, connecting to broker");
+                info!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
@@ -95,7 +93,7 @@ impl WalReceiver {
                loop {
                    select! {
                        _ = task_mgr::shutdown_watcher() => {
-                            trace!("WAL receiver shutdown requested, shutting down");
+                            info!("WAL receiver shutdown requested, shutting down");
                            break;
                        },
                        loop_step_result = connection_manager_loop_step(
@@ -106,7 +104,7 @@ impl WalReceiver {
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
-                                trace!("Connection manager loop ended, shutting down");
+                                info!("Connection manager loop ended, shutting down");
                                break;
                            }
                        },
@@ -117,7 +115,7 @@ impl WalReceiver {
                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
        );

        Self {
@@ -200,19 +198,29 @@ impl<E: Clone> TaskHandle<E> {
                TaskEvent::End(match self.join_handle.as_mut() {
                    Some(jh) => {
                        if !jh.is_finished() {
-                            // See: https://github.com/neondatabase/neon/issues/2885
-                            trace!("sender is dropped while join handle is still alive");
+                            // Barring any implementation errors in this module, we can
+                            // only arrive here while the task that executes the future
+                            // passed to `Self::spawn()` is still executing. Cf the comment
+                            // in Self::spawn().
+                            //
+                            // This was logging at warning level in earlier versions, presumably
+                            // to leave some breadcrumbs in case we had an implementation
+                            // error that would make us get stuck in `jh.await`.
+                            //
+                            // There hasn't been such a bug so far.
+                            // But in a busy system, e.g., during pageserver restart,
+                            // we arrive here often enough that the warning-level logs
+                            // became a distraction.
+                            // So, tone them down to info-level.
+                            //
+                            // XXX: rewrite this module to eliminate the race condition.
+                            info!("sender is dropped while join handle is still alive");
                        }

-                        let res = match jh.await {
-                            Ok(res) => res,
-                            Err(je) if je.is_cancelled() => unreachable!("not used"),
-                            Err(je) if je.is_panic() => {
-                                // already logged
-                                Ok(())
-                            }
-                            Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
-                        };
+                        let res = jh
+                            .await
+                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
+                            .and_then(|x| x);

                        // For cancellation-safety, drop join_handle only after successful .await.
                        self.join_handle = None;
@@ -235,12 +243,12 @@ impl<E: Clone> TaskHandle<E> {
            match jh.await {
                Ok(Ok(())) => debug!("Shutdown success"),
                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => {
-                    // already logged
-                }
-                Err(je) => {
-                    error!("Shutdown task join error: {je}")
+                Err(join_error) => {
+                    if join_error.is_cancelled() {
+                        error!("Shutdown task was cancelled");
+                    } else {
+                        error!("Shutdown task join error: {join_error}")
+                    }
                }
            }
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -18,7 +18,7 @@ use crate::metrics::{
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
+use crate::tenant::Timeline;
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
@@ -55,11 +55,8 @@ pub(super) async fn connection_manager_loop_step(
        .await
    {
        Ok(()) => {}
-        Err(new_state) => {
-            debug!(
-                ?new_state,
-                "state changed, stopping wal connection manager loop"
-            );
+        Err(_) => {
+            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
            return ControlFlow::Break(());
        }
    }
@@ -82,7 +79,7 @@ pub(super) async fn connection_manager_loop_step(
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
-    debug!("Subscribed for broker timeline updates");
+    info!("Subscribed for broker timeline updates");

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -153,13 +150,13 @@ pub(super) async fn connection_manager_loop_step(
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken { .. } | TimelineState::Stopping => {
-                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
+                                TimelineState::Broken | TimelineState::Stopping => {
+                                    info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
                                TimelineState::Loading => {
                                    warn!("timeline transitioned back to Loading state, that should not happen");
-                                    return ControlFlow::Continue(());
+                                    return ControlFlow::Continue(new_state);
                                }
                            }
                        }
@@ -167,11 +164,12 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => match new_event {
-                ControlFlow::Continue(()) => {
+                ControlFlow::Continue(new_state) => {
+                    info!("observed timeline state change, new state is {new_state:?}");
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
-                    debug!("Timeline is no longer active, stopping wal connection manager loop");
+                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
                    return ControlFlow::Break(());
                }
            },
@@ -392,6 +390,7 @@ impl ConnectionManagerState {

        self.drop_old_connection(true).await;

+        let id = self.id;
        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let timeline = Arc::clone(&self.timeline);
@@ -399,13 +398,9 @@ impl ConnectionManagerState {
            TaskKind::WalReceiverConnectionHandler,
            DownloadBehavior::Download,
        );
-
-        let span = info_span!("connection", %node_id);
        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
            async move {
-                debug_assert_current_span_has_tenant_and_timeline_id();
-
-                let res = super::walreceiver_connection::handle_walreceiver_connection(
+                super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
@@ -414,23 +409,12 @@ impl ConnectionManagerState {
                    ctx,
                    node_id,
                )
-                .await;
-
-                match res {
-                    Ok(()) => Ok(()),
-                    Err(e) => {
-                        use super::walreceiver_connection::ExpectedError;
-                        if e.is_expected() {
-                            info!("walreceiver connection handling ended: {e:#}");
-                            Ok(())
-                        } else {
-                            // give out an error to have task_mgr give it a really verbose logging
-                            Err(e).context("walreceiver connection handling failure")
-                        }
-                    }
-                }
+                .await
+                .context("walreceiver connection handling failure")
            }
-            .instrument(span)
+            .instrument(
+                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
+            )
        });

        let now = Utc::now().naive_utc();
@@ -1321,11 +1305,10 @@ mod tests {

    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";

-    async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
+    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
-            .await
+            .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
            .expect("Failed to create an empty timeline for dummy wal connection manager");

        ConnectionManagerState {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, trace, warn, Instrument};
+use tracing::{debug, error, info, trace, warn};

 use super::TaskStateUpdate;
+use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
-    context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
-    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
+    tenant::{Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
 };
@@ -71,8 +71,6 @@ pub(super) async fn handle_walreceiver_connection(
    ctx: RequestContext,
    node: NodeId,
 ) -> anyhow::Result<()> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-
    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
@@ -83,8 +81,13 @@ pub(super) async fn handle_walreceiver_connection(
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
-            Ok(client_and_conn) => client_and_conn?,
-            Err(_elapsed) => {
+            Ok(Ok(client_and_conn)) => client_and_conn,
+            Ok(Err(conn_err)) => {
+                let expected_error = ignore_expected_errors(conn_err)?;
+                info!("DB connection stream finished: {expected_error}");
+                return Ok(());
+            }
+            Err(_) => {
                // Timing out to connect to a safekeeper node could happen long time, due to
                // many reasons that pageserver cannot control.
                // Do not produce an error, but make it visible, that timeouts happen by logging the `event.
@@ -94,7 +97,7 @@ pub(super) async fn handle_walreceiver_connection(
        }
    };

-    debug!("connected!");
+    info!("connected!");
    let mut connection_status = WalConnectionStatus {
        is_connected: true,
        has_processed_wal: false,
@@ -124,28 +127,20 @@ pub(super) async fn handle_walreceiver_connection(
        "walreceiver connection",
        false,
        async move {
-            debug_assert_current_span_has_tenant_and_timeline_id();
-
            select! {
                connection_result = connection => match connection_result {
-                    Ok(()) => debug!("Walreceiver db connection closed"),
+                    Ok(()) => info!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if connection_error.is_expected() {
-                            // silence, because most likely we've already exited the outer call
-                            // with a similar error.
-                        } else {
-                            warn!("Connection aborted: {connection_error:#}")
+                        if let Err(e) = ignore_expected_errors(connection_error) {
+                            warn!("Connection aborted: {e:#}")
                        }
                    }
                },
-                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
+                // Future: replace connection_cancellation with connection_ctx cancellation
+                _ = connection_cancellation.cancelled() => info!("Connection cancelled"),
            }
            Ok(())
-        }
-        // Enrich the log lines emitted by this closure with meaningful context.
-        // TODO: technically, this task outlives the surrounding function, so, the
-        // spans won't be properly nested.
-        .instrument(tracing::info_span!("poller")),
+        },
    );

    // Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -208,13 +203,20 @@ pub(super) async fn handle_walreceiver_connection(
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
-                debug!("walreceiver interrupted");
+                info!("walreceiver interrupted");
                None
            }
            replication_message = physical_stream.next() => replication_message,
        }
    } {
-        let replication_message = replication_message?;
+        let replication_message = match replication_message {
+            Ok(message) => message,
+            Err(replication_error) => {
+                let expected_error = ignore_expected_errors(replication_error)?;
+                info!("Replication stream finished: {expected_error}");
+                return Ok(());
+            }
+        };

        let now = Utc::now().naive_utc();
        let last_rec_lsn_before_msg = last_rec_lsn;
@@ -259,6 +261,8 @@ pub(super) async fn handle_walreceiver_connection(
                    let mut decoded = DecodedWALRecord::default();
                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                        // let _enter = info_span!("processing record", lsn = %lsn).entered();
+
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
@@ -309,15 +313,12 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

-        timeline
-            .check_checkpoint_distance()
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to check checkpoint distance for timeline {}",
-                    timeline.timeline_id
-                )
-            })?;
+        timeline.check_checkpoint_distance().with_context(|| {
+            format!(
+                "Failed to check checkpoint distance for timeline {}",
+                timeline.timeline_id
+            )
+        })?;

        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn =
@@ -420,50 +421,31 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
    }
 }

-/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
-pub(super) trait ExpectedError {
-    /// Test if this error is an ok error.
-    ///
-    /// We don't want to report connectivity problems as real errors towards connection manager because
-    /// 1. they happen frequently enough to make server logs hard to read and
-    /// 2. the connection manager can retry other safekeeper.
-    ///
-    /// If this function returns `true`, it's such an error.
-    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-    /// Connection manager will then handle reconnections.
-    ///
-    /// If this function returns an `false` the error should be propagated and the connection manager
-    /// will log the error at ERROR level.
-    fn is_expected(&self) -> bool;
-}
-
-impl ExpectedError for postgres::Error {
-    fn is_expected(&self) -> bool {
-        self.is_closed()
-            || self
-                .source()
-                .and_then(|source| source.downcast_ref::<std::io::Error>())
-                .map(is_expected_io_error)
-                .unwrap_or(false)
-            || self
-                .as_db_error()
-                .filter(|db_error| {
-                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-                        && db_error.message().contains("ending streaming")
-                })
-                .is_some()
-    }
-}
-
-impl ExpectedError for anyhow::Error {
-    fn is_expected(&self) -> bool {
-        let head = self.downcast_ref::<postgres::Error>();
-
-        let tail = self
-            .chain()
-            .filter_map(|e| e.downcast_ref::<postgres::Error>());
-
-        // check if self or any of the chained/sourced errors are expected
-        head.into_iter().chain(tail).any(|e| e.is_expected())
+/// We don't want to report connectivity problems as real errors towards connection manager because
+/// 1. they happen frequently enough to make server logs hard to read and
+/// 2. the connection manager can retry other safekeeper.
+///
+/// If this function returns `Ok(pg_error)`, it's such an error.
+/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+/// Connection manager will then handle reconnections.
+///
+/// If this function returns an `Err()`, the caller can bubble it up using `?`.
+/// The connection manager will log the error at ERROR level.
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
+    if pg_error.is_closed()
+        || pg_error
+            .source()
+            .and_then(|source| source.downcast_ref::<std::io::Error>())
+            .map(is_expected_io_error)
+            .unwrap_or(false)
+    {
+        return Ok(pg_error);
+    } else if let Some(db_error) = pg_error.as_db_error() {
+        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+            && db_error.message().contains("ending streaming")
+        {
+            return Ok(pg_error);
+        }
    }
+    Err(pg_error).context("connection error")
 }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,12 +76,6 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }

-impl UploadQueueInitialized {
-    pub(super) fn no_pending_work(&self) -> bool {
-        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
-    }
-}
-
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
    NotRunning,
@@ -90,7 +84,9 @@ pub(super) enum SetDeletedFlagProgress {
 }

 pub(super) struct UploadQueueStopped {
-    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
+    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) last_uploaded_consistent_lsn: Lsn,
+    pub(super) latest_metadata: TimelineMetadata,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

@@ -191,15 +187,6 @@ impl UploadQueue {
            UploadQueue::Initialized(x) => Ok(x),
        }
    }
-
-    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
-        match self {
-            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Stopped(stopped) => Ok(stopped),
-        }
-    }
 }

 /// An in-progress upload or delete task.
@@ -212,13 +199,6 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }

-#[derive(Debug)]
-pub(crate) struct Delete {
-    pub(crate) file_kind: RemoteOpFileKind,
-    pub(crate) layer_file_name: LayerFileName,
-    pub(crate) scheduled_from_timeline_delete: bool,
-}
-
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
@@ -227,8 +207,8 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),

-    /// Delete a layer file
-    Delete(Delete),
+    /// Delete a file.
+    Delete(RemoteOpFileKind, LayerFileName),

    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
@@ -246,12 +226,7 @@ impl std::fmt::Display for UploadOp {
                )
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(delete) => write!(
-                f,
-                "Delete(path: {}, scheduled_from_timeline_delete: {})",
-                delete.layer_file_name.file_name(),
-                delete.scheduled_from_timeline_delete
-            ),
+            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -302,6 +302,15 @@ impl VirtualFile {
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
+        //
+        // TODO: We could downgrade the locks to read mode before calling
+        // 'func', to allow a little bit more concurrency, but the standard
+        // library RwLock doesn't allow downgrading without releasing the lock,
+        // and that doesn't seem worth the trouble.
+        //
+        // XXX: `parking_lot::RwLock` supports such downgrades, but its implementation is fair, so
+        // a thread that re-acquires a read lock while a writer is waiting may deadlock.
+        // Simply replacing every `RwLock` in the project causes deadlocks, so use it sparingly.
        let result = STORAGE_IO_TIME
            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
            .observe_closure_duration(|| func(&file));
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

-use anyhow::{Context, Result};
+use anyhow::Result;
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {

        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit().await?;
+        modification.commit()?;

        Ok(())
    }
@@ -1082,10 +1082,7 @@ impl<'a> WalIngest<'a> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            modification
-                .put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            modification.put_rel_creation(rel, 0, ctx).await?;
            0
        } else {
            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1174,6 +1171,7 @@ impl<'a> WalIngest<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::pgdatadir_mapping::create_test_timeline;
    use crate::tenant::harness::*;
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1202,7 +1200,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit().await?;
+        m.commit()?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
@@ -1211,9 +1209,7 @@ mod tests {
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
-            .await?;
+        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
@@ -1221,22 +1217,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -1322,7 +1318,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -1364,7 +1360,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1377,7 +1373,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1402,7 +1398,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1432,16 +1428,14 @@ mod tests {
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
-            .await?;
+        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1460,7 +1454,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit().await?;
+        m.commit()?;

        // Check that rel is not visible anymore
        assert_eq!(
@@ -1478,7 +1472,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1503,9 +1497,7 @@ mod tests {
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
-            .await?;
+        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        // Create a 20 MB relation (the size is arbitrary)
@@ -1517,7 +1509,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit()?;

        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1562,7 +1554,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;

        // Check reported size and contents after truncation
        assert_eq!(
@@ -1611,7 +1603,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit()?;

        assert_eq!(
            tline
@@ -1645,9 +1637,7 @@ mod tests {
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
-            .await?;
+        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut lsn = 0x10;
@@ -1658,7 +1648,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit()?;
        }

        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1674,7 +1664,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1687,7 +1677,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit()?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1703,7 +1693,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit()?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/pgxn/hnsw/Makefile
+++ b/pgxn/hnsw/Makefile
@@ -1,26 +0,0 @@
-EXTENSION = hnsw
-EXTVERSION = 0.1.0
-
-MODULE_big = hnsw
-DATA = $(wildcard *--*.sql)
-OBJS = hnsw.o hnswalg.o
-
-TESTS = $(wildcard test/sql/*.sql)
-REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
-REGRESS_OPTS = --inputdir=test --load-extension=hnsw
-
-# For auto-vectorization:
-# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
-PG_CFLAGS += -O3
-PG_CXXFLAGS +=  -O3 -std=c++11
-PG_LDFLAGS += -lstdc++
-
-all: $(EXTENSION)--$(EXTVERSION).sql
-
-PG_CONFIG ?= pg_config
-PGXS := $(shell $(PG_CONFIG) --pgxs)
-include $(PGXS)
-
-dist:
-	mkdir -p dist
-	git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
--- a/pgxn/hnsw/README.md
+++ b/pgxn/hnsw/README.md
@@ -1,25 +0,0 @@
-# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
-
-This ANN extension of Postgres is based
-on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
-the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
-
-[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
-<br>
-Dmitry Baranchuk, Artem Babenko, Yury Malkov
-
-# Postgres extension
-
-HNSW index is hold in memory (built on demand) and it's maxial size is limited
-by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
-Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
-described in the article).
-
-# Example of usage:
-
-```
-create extension hnsw;
-create table embeddings(id integer primary key, payload real[]);
-create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
-select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
-```
--- a/pgxn/hnsw/hnsw--0.1.0.sql
+++ b/pgxn/hnsw/hnsw--0.1.0.sql
@@ -1,29 +0,0 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
-\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-
-- functions
-
-CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
-	AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-
-- operators
-
-CREATE OPERATOR <-> (
-	LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
-	COMMUTATOR = '<->'
-);
-
-- access method
-
-CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
-	AS 'MODULE_PATHNAME' LANGUAGE C;
-
-CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
-
-COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-
-- opclasses
-
-CREATE OPERATOR CLASS knn_ops
-	DEFAULT FOR TYPE real[] USING hnsw AS
-	OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -1,591 +0,0 @@
-#include "postgres.h"
-
-#include "access/amapi.h"
-#include "access/generic_xlog.h"
-#include "access/relation.h"
-#include "access/reloptions.h"
-#include "access/tableam.h"
-#include "catalog/index.h"
-#include "commands/vacuum.h"
-#include "nodes/execnodes.h"
-#include "storage/bufmgr.h"
-#include "utils/guc.h"
-#include "utils/selfuncs.h"
-
-#include <math.h>
-#include <float.h>
-
-#include "hnsw.h"
-
-PG_MODULE_MAGIC;
-
-typedef struct {
-	int32 vl_len_;		/* varlena header (do not touch directly!) */
-	int dims;
-	int maxelements;
-	int efConstruction;
-	int efSearch;
-	int M;
-} HnswOptions;
-
-static relopt_kind hnsw_relopt_kind;
-
-typedef struct {
-	HierarchicalNSW* hnsw;
-	size_t curr;
-	size_t n_results;
-	ItemPointer results;
-} HnswScanOpaqueData;
-
-typedef HnswScanOpaqueData* HnswScanOpaque;
-
-typedef struct {
-	Oid relid;
-	uint32 status;
-	HierarchicalNSW* hnsw;
-} HnswHashEntry;
-
-
-#define SH_PREFIX			 hnsw_index
-#define SH_ELEMENT_TYPE		 HnswHashEntry
-#define SH_KEY_TYPE			 Oid
-#define SH_KEY				 relid
-#define SH_STORE_HASH
-#define SH_GET_HASH(tb, a)	 ((a)->relid)
-#define SH_HASH_KEY(tb, key) (key)
-#define SH_EQUAL(tb, a, b)	((a) == (b))
-#define SH_SCOPE			static inline
-#define SH_DEFINE
-#define SH_DECLARE
-#include "lib/simplehash.h"
-
-#define INDEX_HASH_SIZE     11
-
-#define DEFAULT_EF_SEARCH   64
-
-PGDLLEXPORT void _PG_init(void);
-
-static hnsw_index_hash *hnsw_indexes;
-
-/*
- * Initialize index options and variables
- */
-void
-_PG_init(void)
-{
-	hnsw_relopt_kind = add_reloption_kind();
-	add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
-					  0, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
-					  0, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
-					  100, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
-					  16, 1, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
-					  64, 1, INT_MAX, AccessExclusiveLock);
-	hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
-}
-
-
-static void
-hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
-					bool *isnull, bool tupleIsAlive, void *state)
-{
-	HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
-	ArrayType* array;
-	int n_items;
-	label_t label = 0;
-
-	/* Skip nulls */
-	if (isnull[0])
-		return;
-
-	array = DatumGetArrayTypeP(values[0]);
-	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-	if (n_items != hnsw_dimensions(hnsw))
-	{
-		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-			 n_items, hnsw_dimensions(hnsw));
-	}
-
-	memcpy(&label, tid, sizeof(*tid));
-	hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
-}
-
-static void
-hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
-{
-	IndexInfo* indexInfo = BuildIndexInfo(indexRel);
-	Assert(indexInfo->ii_NumIndexAttrs == 1);
-	table_index_build_scan(heapRel, indexRel, indexInfo,
-						   true, true, hnsw_build_callback, (void *) hnsw, NULL);
-}
-
-#ifdef __APPLE__
-
-#include <sys/types.h>
-#include <sys/sysctl.h>
-
-static void
-hnsw_check_available_memory(Size requested)
-{
-	size_t total;
-	if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	if ((Size)NBuffers*BLCKSZ + requested >= total)
-		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
-			requested, total - (Size)NBuffers*BLCKSZ);
-}
-
-#else
-
-#include <sys/sysinfo.h>
-
-static void
-hnsw_check_available_memory(Size requested)
-{
-	struct sysinfo si;
-	Size total;
-	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	total = si.totalram*si.mem_unit;
-	if ((Size)NBuffers*BLCKSZ + requested >= total)
-		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
-			requested, total - (Size)NBuffers*BLCKSZ);
-}
-
-#endif
-
-static HierarchicalNSW*
-hnsw_get_index(Relation indexRel, Relation heapRel)
-{
-	HierarchicalNSW* hnsw;
-	Oid indexoid = RelationGetRelid(indexRel);
-	HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
-	if (entry == NULL)
-	{
-		size_t dims, maxelements;
-		size_t M;
-		size_t maxM;
-		size_t size_links_level0;
-		size_t size_data_per_element;
-		size_t data_size;
-		dsm_handle handle = indexoid << 1; /* make it even */
-		void* impl_private = NULL;
-		void* mapped_address = NULL;
-		Size  mapped_size = 0;
-		Size  shmem_size;
-		bool exists = true;
-		bool found;
-		HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
-		if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
-			elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
-		}
-		dims = opts->dims;
-		maxelements = opts->maxelements;
-		M = opts->M;
-		maxM = M * 2;
-		data_size = dims * sizeof(coord_t);
-		size_links_level0 = (maxM + 1) * sizeof(idx_t);
-		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
-		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
-
-		hnsw_check_available_memory(shmem_size);
-
-		/* first try to attach to existed index */
-		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
-						 &mapped_address, &mapped_size, DEBUG1))
-		{
-			/* index doesn't exists: try to create it */
-			if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
-							 &mapped_address, &mapped_size, DEBUG1))
-			{
-				/* We can do it under shared lock, so some other backend may
-				 * try to initialize index. If create is failed because index already
-				 * created by somebody else, then try to attach to it once again
-				 */
-				if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
-								 &mapped_address, &mapped_size, ERROR))
-				{
-					return NULL;
-				}
-			}
-			else
-			{
-				exists = false;
-			}
-		}
-		Assert(mapped_size == shmem_size);
-		hnsw = (HierarchicalNSW*)mapped_address;
-
-		if (!exists)
-		{
-			hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
-			hnsw_populate(hnsw, indexRel, heapRel);
-		}
-		entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
-		Assert(!found);
-		entry->hnsw = hnsw;
-	}
-	else
-	{
-		hnsw = entry->hnsw;
-	}
-	return hnsw;
-}
-
-/*
- * Start or restart an index scan
- */
-static IndexScanDesc
-hnsw_beginscan(Relation index, int nkeys, int norderbys)
-{
-	IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
-	HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
-	Relation heap = relation_open(index->rd_index->indrelid, NoLock);
-	so->hnsw = hnsw_get_index(index, heap);
-	relation_close(heap, NoLock);
-	so->curr = 0;
-	so->n_results = 0;
-	so->results = NULL;
-	scan->opaque = so;
-	return scan;
-}
-
-/*
- * Start or restart an index scan
- */
-static void
-hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-	if (so->results)
-	{
-		pfree(so->results);
-		so->results = NULL;
-	}
-	so->curr = 0;
-	if (orderbys && scan->numberOfOrderBys > 0)
-		memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
-}
-
-/*
- * Fetch the next tuple in the given scan
- */
-static bool
-hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-
-	/*
-	 * Index can be used to scan backward, but Postgres doesn't support
-	 * backward scan on operators
-	 */
-	Assert(ScanDirectionIsForward(dir));
-
-	if (so->curr == 0)
-	{
-		Datum		value;
-		ArrayType*	array;
-		int         n_items;
-		size_t      n_results;
-		label_t*    results;
-		HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
-		size_t      efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
-
-		/* Safety check */
-		if (scan->orderByData == NULL)
-			elog(ERROR, "cannot scan HNSW index without order");
-
-		/* No items will match if null */
-		if (scan->orderByData->sk_flags & SK_ISNULL)
-			return false;
-
-		value = scan->orderByData->sk_argument;
-		array = DatumGetArrayTypeP(value);
-		n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-		if (n_items != hnsw_dimensions(so->hnsw))
-		{
-			elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-				 n_items, hnsw_dimensions(so->hnsw));
-		}
-
-		if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
-			elog(ERROR, "HNSW index search failed");
-		so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
-		so->n_results = n_results;
-		for (size_t i = 0; i < n_results; i++)
-		{
-			memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
-		}
-		free(results);
-	}
-	if (so->curr >= so->n_results)
-	{
-		return false;
-	}
-	else
-	{
-		scan->xs_heaptid = so->results[so->curr++];
-		scan->xs_recheckorderby = false;
-		return true;
-	}
-}
-
-/*
- * End a scan and release resources
- */
-static void
-hnsw_endscan(IndexScanDesc scan)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-	if (so->results)
-		pfree(so->results);
-	pfree(so);
-	scan->opaque = NULL;
-}
-
-
-/*
- * Estimate the cost of an index scan
- */
-static void
-hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
-				 Cost *indexStartupCost, Cost *indexTotalCost,
-				 Selectivity *indexSelectivity, double *indexCorrelation
-				 ,double *indexPages
-)
-{
-	GenericCosts costs;
-
-	/* Never use index without order */
-	if (path->indexorderbys == NULL)
-	{
-		*indexStartupCost = DBL_MAX;
-		*indexTotalCost = DBL_MAX;
-		*indexSelectivity = 0;
-		*indexCorrelation = 0;
-		*indexPages = 0;
-		return;
-	}
-
-	MemSet(&costs, 0, sizeof(costs));
-
-	genericcostestimate(root, path, loop_count, &costs);
-
-	/* Startup cost and total cost are same */
-	*indexStartupCost = costs.indexTotalCost;
-	*indexTotalCost = costs.indexTotalCost;
-	*indexSelectivity = costs.indexSelectivity;
-	*indexCorrelation = costs.indexCorrelation;
-	*indexPages = costs.numIndexPages;
-}
-
-/*
- * Parse and validate the reloptions
- */
-static bytea *
-hnsw_options(Datum reloptions, bool validate)
-{
-	static const relopt_parse_elt tab[] = {
-		{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
-		{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
-		{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
-		{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
-		{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
-	};
-
-	return (bytea *) build_reloptions(reloptions, validate,
-									  hnsw_relopt_kind,
-									  sizeof(HnswOptions),
-									  tab, lengthof(tab));
-}
-
-/*
- * Validate catalog entries for the specified operator class
- */
-static bool
-hnsw_validate(Oid opclassoid)
-{
-	return true;
-}
-
-/*
- * Build the index for a logged table
- */
-static IndexBuildResult *
-hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
-{
-	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
-	IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-	result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
-
-	return result;
-}
-
-/*
- * Insert a tuple into the index
- */
-static bool
-hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
-			  Relation heap, IndexUniqueCheck checkUnique,
-			  bool indexUnchanged,
-			  IndexInfo *indexInfo)
-{
-	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
-	Datum value;
-	ArrayType* array;
-	int n_items;
-	label_t label = 0;
-
-	/* Skip nulls */
-	if (isnull[0])
-		return false;
-
-	/* Detoast value */
-	value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
-	array = DatumGetArrayTypeP(value);
-	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-	if (n_items != hnsw_dimensions(hnsw))
-	{
-		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-			 n_items, hnsw_dimensions(hnsw));
-	}
-	memcpy(&label, heap_tid, sizeof(*heap_tid));
-	if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
-		elog(ERROR, "HNSW index insert failed");
-	return true;
-}
-
-/*
- * Build the index for an unlogged table
- */
-static void
-hnsw_buildempty(Relation index)
-{
-	/* index will be constructed on dema nd when accessed */
-}
-
-/*
- * Clean up after a VACUUM operation
- */
-static IndexBulkDeleteResult *
-hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
-{
-	Relation	rel = info->index;
-
-	if (stats == NULL)
-		return NULL;
-
-	stats->num_pages = RelationGetNumberOfBlocks(rel);
-
-	return stats;
-}
-
-/*
- * Bulk delete tuples from the index
- */
-static IndexBulkDeleteResult *
-hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-				IndexBulkDeleteCallback callback, void *callback_state)
-{
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	return stats;
-}
-
-/*
- * Define index handler
- *
- * See https://www.postgresql.org/docs/current/index-api.html
- */
-PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
-Datum
-hnsw_handler(PG_FUNCTION_ARGS)
-{
-	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
-
-	amroutine->amstrategies = 0;
-	amroutine->amsupport = 0;
-	amroutine->amoptsprocnum = 0;
-	amroutine->amcanorder = false;
-	amroutine->amcanorderbyop = true;
-	amroutine->amcanbackward = false;	/* can change direction mid-scan */
-	amroutine->amcanunique = false;
-	amroutine->amcanmulticol = false;
-	amroutine->amoptionalkey = true;
-	amroutine->amsearcharray = false;
-	amroutine->amsearchnulls = false;
-	amroutine->amstorage = false;
-	amroutine->amclusterable = false;
-	amroutine->ampredlocks = false;
-	amroutine->amcanparallel = false;
-	amroutine->amcaninclude = false;
-	amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
-	amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
-	amroutine->amkeytype = InvalidOid;
-
-	/* Interface functions */
-	amroutine->ambuild = hnsw_build;
-	amroutine->ambuildempty = hnsw_buildempty;
-	amroutine->aminsert = hnsw_insert;
-	amroutine->ambulkdelete = hnsw_bulkdelete;
-	amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
-	amroutine->amcanreturn = NULL;	/* tuple not included in heapsort */
-	amroutine->amcostestimate = hnsw_costestimate;
-	amroutine->amoptions = hnsw_options;
-	amroutine->amproperty = NULL;	/* TODO AMPROP_DISTANCE_ORDERABLE */
-	amroutine->ambuildphasename = NULL;
-	amroutine->amvalidate = hnsw_validate;
-	amroutine->amadjustmembers = NULL;
-	amroutine->ambeginscan = hnsw_beginscan;
-	amroutine->amrescan = hnsw_rescan;
-	amroutine->amgettuple = hnsw_gettuple;
-	amroutine->amgetbitmap = NULL;
-	amroutine->amendscan = hnsw_endscan;
-	amroutine->ammarkpos = NULL;
-	amroutine->amrestrpos = NULL;
-
-	/* Interface functions to support parallel index scans */
-	amroutine->amestimateparallelscan = NULL;
-	amroutine->aminitparallelscan = NULL;
-	amroutine->amparallelrescan = NULL;
-
-	PG_RETURN_POINTER(amroutine);
-}
-
-/*
- * Get the L2 distance between vectors
- */
-PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
-Datum
-l2_distance(PG_FUNCTION_ARGS)
-{
-	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
-	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
-	int         a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
-	int         b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
-	dist_t 		distance = 0.0;
-	dist_t		diff;
-	coord_t	   *ax = (coord_t*)ARR_DATA_PTR(a);
-	coord_t	   *bx = (coord_t*)ARR_DATA_PTR(b);
-
-	if (a_dim != b_dim)
-	{
-		ereport(ERROR,
-				(errcode(ERRCODE_DATA_EXCEPTION),
-				 errmsg("different array dimensions %d and %d", a_dim, b_dim)));
-	}
-
-	#pragma clang loop vectorize(enable)
-	for (int i = 0; i < a_dim; i++)
-	{
-		diff = ax[i] - bx[i];
-		distance += diff * diff;
-	}
-
-	PG_RETURN_FLOAT4((dist_t)sqrt(distance));
-}
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -1,5 +0,0 @@
-comment = 'hnsw index'
-default_version = '0.1.0'
-module_pathname = '$libdir/hnsw'
-relocatable = true
-trusted = true
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bojan Serafimov	a4bd82cb81	Merge branch 'layer-stats' of github.com:neondatabase/neon into layer-stats	2023-06-10 10:47:42 -04:00
Bojan Serafimov	c64798956d	Address comments	2023-06-10 10:46:28 -04:00
bojanserafimov	39bbaecb03	accept suggestion Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-06-10 10:35:13 -04:00
Bojan Serafimov	26dca374eb	Add layer stats cli	2023-06-02 16:46:48 -04:00