Merge pull request #4571 from neondatabase/releases/2023-06-27

Release 2023-06-27
Merge branch 'release' into releases/2023-06-27
2026-01-25 06:10:37 +00:00 · 2023-06-27 12:55:33 +02:00 · 2023-06-27 12:22:12 +02:00 · 2023-06-27 10:55:03 +01:00 · 2023-06-27 10:57:28 +03:00 · 2023-06-26 14:10:27 -04:00
206 changed files with 13985 additions and 5192 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -57,14 +57,14 @@ runs:
        if ! which allure; then
          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check
          unzip -q ${ALLURE_ZIP}
          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.22.0
-        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+        ALLURE_VERSION: 2.22.1
+        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b

    # Several builds may be running for the same key at once (for example, for the main branch), so we use an improvised lock for this
    - name: Acquire lock
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -36,14 +36,6 @@ inputs:
    description: 'Region name for real s3 tests'
    required: false
    default: ''
-  real_s3_access_key_id:
-    description: 'Access key id'
-    required: false
-    default: ''
-  real_s3_secret_access_key:
-    description: 'Secret access key'
-    required: false
-    default: ''
  rerun_flaky:
    description: 'Whether to rerun flaky tests'
    required: false
@@ -104,8 +96,6 @@ runs:
        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
-        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
-        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        # Env values are plain strings unless wrapped in expression delimiters; a bare
        # contains(...) would be exported as that literal text instead of 'true'/'false'.
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: ${{ contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') }}
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: ${{ contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -180,7 +180,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    timeout-minutes: 360 # 6h
+    # Increase timeout to 8h, default timeout is 6h
+    timeout-minutes: 480

    steps:
    - uses: actions/checkout@v3
@@ -321,8 +322,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    timeout-minutes: 360 # 6h
-
    steps:
    - uses: actions/checkout@v3

@@ -414,8 +413,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    timeout-minutes: 360 # 6h
-
    steps:
    - uses: actions/checkout@v3

@@ -501,8 +498,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    timeout-minutes: 360 # 6h
-
    steps:
    - uses: actions/checkout@v3

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

      - name: Install rust binaries
        run: |
@@ -346,10 +346,8 @@ jobs:
          test_selection: regress
          needs_postgres_source: true
          run_with_real_s3: true
-          real_s3_bucket: ci-tests-s3
-          real_s3_region: us-west-2
-          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
-          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
@@ -409,9 +407,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate

      - uses: actions/github-script@v6
-        if: >
-          !cancelled() &&
-          github.event_name == 'pull_request'
+        if: ${{ !cancelled() }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -421,7 +417,7 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

-            const script = require("./scripts/pr-comment-test-report.js")
+            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
@@ -496,19 +492,24 @@ jobs:
        env:
          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          scripts/coverage \
-            --dir=/tmp/coverage report \
+          scripts/coverage --dir=/tmp/coverage \
+            report \
            --input-objects=/tmp/coverage/binaries.list \
            --commit-url=${COMMIT_URL} \
            --format=github

+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
      - name: Upload coverage report
        id: upload-coverage-report
        env:
          BUCKET: neon-github-public-dev
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
@@ -622,48 +623,6 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-
-  neon-image-depot:
-    # For testing this will run side-by-side for a few merges.
-    # This action is not really optimized yet, but gets the job done
-    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-    permissions:
-      contents: read
-      id-token: write
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Setup go
-        uses: actions/setup-go@v3
-        with:
-          go-version: '1.19'
-
-      - name: Set up Depot CLI
-        uses: depot/setup-action@v1
-
-      - name: Install Crane & ECR helper
-        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Build and push
-        uses: depot/build-push-action@v1
-        with:
-          # if no depot.json file is at the root of your repo, you must specify the project id
-          project: nrdv0s4kcs
-          push: true
-          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
-
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
@@ -700,6 +659,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -757,6 +717,7 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
+                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -777,7 +738,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.4.6
+      VM_BUILDER_VERSION: v0.11.0

    steps:
      - name: Checkout
@@ -787,21 +748,18 @@ jobs:

      - name: Downloading vm-builder
        run: |
-          curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
+          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

+      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
+      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

-      - name: Building VM compute-node rootfs
-        run: |
-          docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
-
      - name: Build vm image
        run: |
-          # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
-          ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
@@ -958,6 +916,20 @@ jobs:
            exit 1
          fi

+      - name: Create git tag
+        if: github.ref_name == 'release'
+        uses: actions/github-script@v6
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            // Await the request so the script does not return before the ref is created
+            // and a rejected request surfaces as a failure of this step.
+            await github.rest.git.createRef({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
+              sha: context.sha,
+            })
+
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,6 +3,7 @@ name: Create Release Branch
 on:
  schedule:
    - cron: '0 10 * * 2'
+  workflow_dispatch:

 jobs:
  create_release_branch:
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,12 +3,26 @@ members = [
    "compute_tools",
    "control_plane",
    "pageserver",
+    "pageserver/ctl",
    "proxy",
    "safekeeper",
    "storage_broker",
    "workspace_hack",
    "trace",
-    "libs/*",
+    "libs/compute_api",
+    "libs/pageserver_api",
+    "libs/postgres_ffi",
+    "libs/safekeeper_api",
+    "libs/utils",
+    "libs/consumption_metrics",
+    "libs/postgres_backend",
+    "libs/pq_proto",
+    "libs/tenant_size_model",
+    "libs/metrics",
+    "libs/postgres_connection",
+    "libs/remote_storage",
+    "libs/tracing-utils",
+    "libs/postgres_ffi/wal_craft",
 ]

 [workspace.package]
@@ -22,7 +36,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.25"
+aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
@@ -126,11 +140,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
@@ -166,7 +180,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
--- a/6
+++ b/6
@@ -47,8 +47,7 @@ RUN set -e \
    && mold -run cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
-      --bin pageserver_binutils  \
-      --bin draw_timeline_dir \
+      --bin pagectl  \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
@@ -73,8 +72,7 @@ RUN set -e \

 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -2,6 +2,7 @@ ARG PG_VERSION
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
+ARG BUILD_TAG

 #########################################################################################
 #
@@ -67,7 +68,7 @@ RUN apt update && \
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
-    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

@@ -95,7 +96,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +356,7 @@ RUN apt-get update && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
-    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +411,7 @@ RUN apt-get update && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,6 +433,88 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control

+#########################################################################################
+#
+# Layer "rdkit-pg-build"
+# compile rdkit extension
+#
+#########################################################################################
+FROM build-deps AS rdkit-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN apt-get update && \
+    apt-get install -y \
+        cmake \
+        libboost-iostreams1.74-dev \
+        libboost-regex1.74-dev \
+        libboost-serialization1.74-dev \
+        libboost-system1.74-dev \
+        libeigen3-dev \
+        libfreetype6-dev
+
+# Put the copied PostgreSQL install on PATH so the build below can call pg_config.
+# key=value is the current ENV syntax; the space-separated form is legacy.
+ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
+    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
+    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    cmake \
+        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
+        -D RDK_BUILD_INCHI_SUPPORT=ON \
+        -D RDK_BUILD_AVALON_SUPPORT=ON \
+        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
+        -D RDK_BUILD_DESCRIPTORS3D=OFF \
+        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
+        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
+        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
+        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
+        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
+        -D RDK_USE_URF=OFF \
+        -D RDK_BUILD_PGSQL=ON \
+        -D RDK_PGSQL_STATIC=ON \
+        -D PostgreSQL_CONFIG=pg_config \
+        -D PostgreSQL_INCLUDE_DIR=$(pg_config --includedir) \
+        -D PostgreSQL_TYPE_INCLUDE_DIR=$(pg_config --includedir-server) \
+        -D PostgreSQL_LIBRARY_DIR=$(pg_config --libdir) \
+        -D RDK_INSTALL_INTREE=OFF \
+        -D CMAKE_BUILD_TYPE=Release \
+        . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
+
+#########################################################################################
+#
+# Layer "pg-uuidv7-pg-build"
+# compile pg_uuidv7 extension
+#
+#########################################################################################
+FROM build-deps AS pg-uuidv7-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# Put the copied PostgreSQL binaries on PATH for the extension build below.
+# key=value is the current ENV syntax; the space-separated form is legacy.
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
+    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
+    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
+
+#########################################################################################
+#
+# Layer "pg-roaringbitmap-pg-build"
+# compile pg_roaringbitmap extension
+#
+#########################################################################################
+FROM build-deps AS pg-roaringbitmap-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# Put the copied PostgreSQL binaries on PATH for the extension build below.
+# key=value is the current ENV syntax; the space-separated form is legacy.
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
+    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
+    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
+
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -517,6 +600,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

+#########################################################################################
+#
+# Layer "pg-pgx-ulid-build"
+# Compile "pgx_ulid" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-pgx-ulid-build
+
+# Fetch the pinned pgx_ulid release and verify its checksum before unpacking.
+# The sed rewrites the exact `pgx        = "=0.7.3"` line in Cargo.toml so that pgx is
+# built with the "unsafe-postgres" feature (and without the `=` exact-version pin).
+# It matches the upstream spacing literally, so it silently does nothing if that
+# line changes; re-check it whenever the pgx_ulid version is bumped.
+# After install, the extension is marked trusted via its ulid.control file.
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -547,6 +646,10 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -556,6 +659,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/hnsw \
        -s install

 #########################################################################################
@@ -564,6 +671,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
+ARG BUILD_TAG
+ENV BUILD_TAG=$BUILD_TAG
+
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
@@ -616,14 +726,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
+# libboost*, libfreetype6, and zlib1g for rdkit
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
-        locales \
        libicu67 \
        liblz4-1 \
        libreadline8 \
+        libboost-iostreams1.74.0 \
+        libboost-regex1.74.0 \
+        libboost-serialization1.74.0 \
+        libboost-system1.74.0 \
        libossp-uuid16 \
+        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -632,7 +747,10 @@ RUN apt update &&  \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
-        procps && \
+        libcurl4-openssl-dev \
+        locales \
+        procps \
+        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -3,6 +3,7 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
+ARG BUILD_TAG

 FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
 WORKDIR /home/nonroot
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
 ARG CACHEPOT_BUCKET=neon-github-dev
 #ARG AWS_ACCESS_KEY_ID
 #ARG AWS_SECRET_ACCESS_KEY
+ARG BUILD_TAG
+ENV BUILD_TAG=$BUILD_TAG

 COPY . .

--- a/Dockerfile.vm-compute-node
+++ b/Dockerfile.vm-compute-node
@@ -1,70 +0,0 @@
-# Note: this file *mostly* just builds on Dockerfile.compute-node
-
-ARG SRC_IMAGE
-ARG VM_INFORMANT_VERSION=v0.1.14
-# on libcgroup update, make sure to check bootstrap.sh for changes
-ARG LIBCGROUP_VERSION=v2.0.3
-
-# Pull VM informant, to copy from later
-FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
-
-# Build cgroup-tools
-#
-# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
-# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
-# requires cgroup v2, so we'll build cgroup-tools ourselves.
-FROM debian:bullseye-slim as libcgroup-builder
-ARG LIBCGROUP_VERSION
-
-RUN set -exu \
-	&& apt update \
-	&& apt install --no-install-recommends -y \
-		git \
-		ca-certificates \
-		automake \
-		cmake \
-		make \
-		gcc \
-		byacc \
-		flex \
-		libtool \
-		libpam0g-dev \
-	&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
-	&& INSTALL_DIR="/libcgroup-install" \
-	&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
-	&& cd libcgroup \
-	# extracted from bootstrap.sh, with modified flags:
-	&& (test -d m4 || mkdir m4) \
-	&& autoreconf -fi \
-	&& rm -rf autom4te.cache \
-	&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
-	# actually build the thing...
-	&& make install
-
-# Combine, starting from non-VM compute node image.
-FROM $SRC_IMAGE as base
-
-# Temporarily set user back to root so we can run adduser, set inittab
-USER root
-RUN adduser vm-informant --disabled-password --no-create-home
-
-RUN set -e \
-	&& rm -f /etc/inittab \
-	&& touch /etc/inittab
-
-RUN set -e \
-	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
-	&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
-	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
-	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
-
-USER postgres
-
-ADD vm-cgconfig.conf /etc/cgconfig.conf
-COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
-
-COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
-COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
-COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
-
-ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
--- a/8
+++ b/8
@@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+	+@echo "Compiling hnsw $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -153,6 +158,9 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
+libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
+libcurl4-openssl-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel
+  protobuf-devel libcurl-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf
+postgresql-libs cmake postgresql protobuf curl
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -54,11 +54,20 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

+const BUILD_TAG_DEFAULT: &str = "local";
+
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

+    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
+
+    info!("build_tag: {build_tag}");
+
    let matches = cli().get_matches();

+    let http_port = *matches
+        .get_one::<u16>("http-port")
+        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
@@ -178,7 +187,8 @@ fn main() -> Result<()> {

    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
-    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    let _http_handle =
+        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -286,6 +296,14 @@ fn cli() -> clap::Command {
    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
    clap::Command::new("compute_ctl")
        .version(version)
+        .arg(
+            Arg::new("http-port")
+                .long("http-port")
+                .value_name("HTTP_PORT")
+                .default_value("3080")
+                .value_parser(clap::value_parser!(u16))
+                .required(false),
+        )
        .arg(
            Arg::new("connstr")
                .short('C')
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,19 +1,3 @@
-//
-// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
-// but there are several things that makes `PostgresNode` usage inconvenient in the
-// cloud:
-// - it inherits from `LocalEnv`, which contains **all-all** the information about
-//   a complete service running
-// - it uses `PageServerNode` with information about http endpoint, which we do not
-//   need in the cloud again
-// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
-//
-// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
-// attributes (not required for the cloud). Yet, it is still tempting to unify these
-// `PostgresNode` and `ComputeNode` and use one in both places.
-//
-// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
-//
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
@@ -106,26 +90,38 @@ pub struct ParsedSpec {
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
+        // Extract the options from the spec file that are needed to connect to
+        // the storage system.
+        //
+        // For backwards-compatibility, the top-level fields in the spec file
+        // may be empty. In that case, we need to dig them from the GUCs in the
+        // cluster.settings field.
        let pageserver_connstr = spec
-            .cluster
-            .settings
-            .find("neon.pageserver_connstring")
+            .pageserver_connstring
+            .clone()
+            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
        let storage_auth_token = spec.storage_auth_token.clone();
-        let tenant_id: TenantId = spec
-            .cluster
-            .settings
-            .find("neon.tenant_id")
-            .ok_or("tenant id should be provided")
-            .map(|s| TenantId::from_str(&s))?
-            .or(Err("invalid tenant id"))?;
-        let timeline_id: TimelineId = spec
-            .cluster
-            .settings
-            .find("neon.timeline_id")
-            .ok_or("timeline id should be provided")
-            .map(|s| TimelineId::from_str(&s))?
-            .or(Err("invalid timeline id"))?;
+        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
+            tenant_id
+        } else {
+            spec.cluster
+                .settings
+                .find("neon.tenant_id")
+                .ok_or("tenant id should be provided")
+                .map(|s| TenantId::from_str(&s))?
+                .or(Err("invalid tenant id"))?
+        };
+        let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
+            timeline_id
+        } else {
+            spec.cluster
+                .settings
+                .find("neon.timeline_id")
+                .ok_or("timeline id should be provided")
+                .map(|s| TimelineId::from_str(&s))?
+                .or(Err("invalid timeline id"))?
+        };

        Ok(ParsedSpec {
            spec,
@@ -137,6 +133,84 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

+/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
+/// that we give to customers
+fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    let roles = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| format!("'{}'", escape_literal(&r.name)))
+        .collect::<Vec<_>>();
+
+    let dbs = spec
+        .cluster
+        .databases
+        .iter()
+        .map(|db| format!("'{}'", escape_literal(&db.name)))
+        .collect::<Vec<_>>();
+
+    let roles_decl = if roles.is_empty() {
+        String::from("roles text[] := NULL;")
+    } else {
+        format!(
+            r#"
+               roles text[] := ARRAY(SELECT rolname
+                                     FROM pg_catalog.pg_roles
+                                     WHERE rolname IN ({}));"#,
+            roles.join(", ")
+        )
+    };
+
+    let database_decl = if dbs.is_empty() {
+        String::from("dbs text[] := NULL;")
+    } else {
+        format!(
+            r#"
+               dbs text[] := ARRAY(SELECT datname
+                                   FROM pg_catalog.pg_database
+                                   WHERE datname IN ({}));"#,
+            dbs.join(", ")
+        )
+    };
+
+    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
+    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
+    let query = format!(
+        r#"
+            DO $$
+                DECLARE
+                    r text;
+                    {}
+                    {}
+                BEGIN
+                    IF NOT EXISTS (
+                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
+                    THEN
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
+                        IF array_length(roles, 1) IS NOT NULL THEN
+                            EXECUTE format('GRANT neon_superuser TO %s',
+                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
+                            FOREACH r IN ARRAY roles LOOP
+                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
+                            END LOOP;
+                        END IF;
+                        IF array_length(dbs, 1) IS NOT NULL THEN
+                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
+                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
+                        END IF;
+                    END IF;
+                END
+            $$;"#,
+        roles_decl, database_decl,
+    );
+    info!("Neon superuser created:\n{}", &query);
+    client
+        .simple_query(&query)
+        .map_err(|e| anyhow::anyhow!(e).context(query))?;
+    Ok(())
+}
+
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
@@ -295,8 +369,8 @@ impl ComputeNode {
        update_pg_hba(pgdata_path)?;

        match spec.mode {
-            ComputeMode::Primary | ComputeMode::Static(..) => {}
-            ComputeMode::Replica => {
+            ComputeMode::Primary => {}
+            ComputeMode::Replica | ComputeMode::Static(..) => {
                add_standby_signal(pgdata_path)?;
            }
        }
@@ -351,6 +425,8 @@ impl ComputeNode {
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;

                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
+                // Disable forwarding so that users don't get a cloud_admin role
+                client.simple_query("SET neon.forward_ddl = false")?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
@@ -361,22 +437,21 @@ impl ComputeNode {
            Ok(client) => client,
        };

+        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        client.simple_query("SET neon.forward_ddl = false")?;
+
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
+        create_neon_superuser(spec, &mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;

        // 'Close' connection
        drop(client);

-        info!(
-            "finished configuration of compute for project {}",
-            spec.cluster.cluster_id
-        );
-
        Ok(())
    }

@@ -403,11 +478,13 @@ impl ComputeNode {
        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
+            client.simple_query("SET neon.forward_ddl = false")?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(&spec, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }

@@ -427,22 +504,22 @@ impl ComputeNode {
    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
-        let spec = compute_state.pspec.as_ref().expect("spec must be set");
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            spec.spec.cluster.cluster_id,
-            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            spec.tenant_id,
-            spec.timeline_id,
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
+            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            pspec.tenant_id,
+            pspec.timeline_id,
        );

        self.prepare_pgdata(&compute_state)?;

        let start_time = Utc::now();

-        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
+        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

-        if spec.spec.mode == ComputeMode::Primary {
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
            self.apply_config(&compute_state)?;
        }

@@ -462,6 +539,11 @@ impl ComputeNode {
        }
        self.set_status(ComputeStatus::Running);

+        info!(
+            "finished configuration of compute for project {}",
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
+        );
+
        Ok(pg)
    }

--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -5,6 +5,7 @@ use std::path::Path;

 use anyhow::Result;

+use crate::pg_helpers::escape_conf_value;
 use crate::pg_helpers::PgOptionsSerialize;
 use compute_api::spec::{ComputeMode, ComputeSpec};

@@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

-    writeln!(file, "# Managed by compute_ctl: begin")?;
+    // Write the postgresql.conf content from the spec file as is.
+    if let Some(conf) = &spec.cluster.postgresql_conf {
+        writeln!(file, "{}", conf)?;
+    }

    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;

+    // Add options for connecting to storage
+    writeln!(file, "# Neon storage settings")?;
+    if let Some(s) = &spec.pageserver_connstring {
+        writeln!(
+            file,
+            "neon.pageserver_connstring='{}'",
+            escape_conf_value(s)
+        )?;
+    }
+    if !spec.safekeeper_connstrings.is_empty() {
+        writeln!(
+            file,
+            "neon.safekeepers='{}'",
+            escape_conf_value(&spec.safekeeper_connstrings.join(","))
+        )?;
+    }
+    if let Some(s) = &spec.tenant_id {
+        writeln!(
+            file,
+            "neon.tenant_id='{}'",
+            escape_conf_value(&s.to_string())
+        )?;
+    }
+    if let Some(s) = &spec.timeline_id {
+        writeln!(
+            file,
+            "neon.timeline_id='{}'",
+            escape_conf_value(&s.to_string())
+        )?;
+    }
+
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
@@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        }
    }

-    writeln!(file, "# Managed by compute_ctl: end")?;
+    // If there are any extra options in the 'settings' field, append those
+    if spec.cluster.settings.is_some() {
+        writeln!(file, "# Managed by compute_ctl: begin")?;
+        write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
+        writeln!(file, "# Managed by compute_ctl: end")?;
+    }

    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {

 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
-async fn serve(state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
+async fn serve(port: u16, state: Arc<ComputeNode>) {
+    let addr = SocketAddr::from(([0, 0, 0, 0], port));

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(state: Arc<ComputeNode>) {
 }

 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);

    Ok(thread::Builder::new()
        .name("http-endpoint".into())
-        .spawn(move || serve(state))?)
+        .spawn(move || serve(port, state))?)
 }
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .init();
    tracing::info!("logging and tracing started");

+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
    Ok(())
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -17,13 +17,13 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

 /// Escape a string for including it in a SQL literal
-fn escape_literal(s: &str) -> String {
+pub fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

 /// Escape a string so that it can be used in postgresql.conf.
 /// Same as escape_literal, currently.
-fn escape_conf_value(s: &str) -> String {
+pub fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

@@ -121,9 +121,8 @@ impl RoleExt for Role {
    /// string of arguments.
    fn to_pg_options(&self) -> String {
        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
-        // For now, we do not use generic `options` for roles. Once used, add
-        // `self.options.as_pg_options()` somewhere here.
-        let mut params: String = "LOGIN".to_string();
+        let mut params: String = self.options.as_pg_options();
+        params.push_str(" LOGIN");

        if let Some(pass) = &self.encrypted_password {
            // Some time ago we supported only md5 and treated all encrypted_password as md5.
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -62,7 +62,7 @@ fn do_control_plane_request(
    }
 }

-/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
+/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
 /// env variable is set, it will be used for authorization.
 pub fn get_spec_from_control_plane(
    base_uri: &str,
@@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
+                let mut query: String = format!(
+                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    name.pg_quote()
+                );
                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
-
-                let grant_query = format!(
-                    "GRANT pg_read_all_data, pg_write_all_data TO {}",
-                    name.pg_quote()
-                );
-                xact.execute(grant_query.as_str(), &[])?;
-                info!("role grant query: '{}'", &grant_query);
            }
        }

@@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                query.push_str(&db.to_pg_options());
                let _guard = info_span!("executing", query).entered();
                client.execute(query.as_str(), &[])?;
+                let grant_query: String = format!(
+                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
+                    name.pg_quote()
+                );
+                client.execute(grant_query.as_str(), &[])?;
            }
        };

@@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
    info!("cluster spec grants:");

-    // We now have a separate `web_access` role to connect to the database
-    // via the web interface and proxy link auth. And also we grant a
-    // read / write all data privilege to every role. So also grant
-    // create to everyone.
-    // XXX: later we should stop messing with Postgres ACL in such horrible
-    // ways.
-    let roles = spec
-        .cluster
-        .roles
-        .iter()
-        .map(|r| r.name.pg_quote())
-        .collect::<Vec<_>>();
-
-    for db in &spec.cluster.databases {
-        let dbname = &db.name;
-
-        let query: String = format!(
-            "GRANT CREATE ON DATABASE {} TO {}",
-            dbname.pg_quote(),
-            roles.join(", ")
-        );
-        info!("grant query {}", &query);
-
-        client.execute(query.as_str(), &[])?;
-    }
-
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -16,7 +16,7 @@ mod pg_helpers_tests {
        );
        assert_eq!(
            spec.cluster.roles.first().unwrap().to_pg_options(),
-            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+            " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
        );
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "14";
+const DEFAULT_PG_VERSION: &str = "15";

 fn default_conf() -> String {
    format!(
@@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -

            println!("Creating endpoint for imported timeline ...");
            cplane.new_endpoint(
-                tenant_id,
                name,
+                tenant_id,
                timeline_id,
                None,
+                None,
                pg_version,
                ComputeMode::Primary,
            )?;
@@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

                table.add_row([
                    endpoint_id.as_str(),
-                    &endpoint.address.to_string(),
+                    &endpoint.pg_address.to_string(),
                    &endpoint.timeline_id.to_string(),
                    branch_name,
                    lsn_str.as_str(),
@@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_branch_timeline_id(branch_name, tenant_id)
                .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;

-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
-
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let pg_version = sub_args
                .get_one::<u32>("pg-version")
                .copied()
@@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
+            cplane.new_endpoint(
+                &endpoint_id,
+                tenant_id,
+                timeline_id,
+                pg_port,
+                http_port,
+                pg_version,
+                mode,
+            )?;
        }
        "start" => {
-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            // If --safekeepers argument is given, use only the listed safekeeper nodes.
+            let safekeepers =
+                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
+                    let mut safekeepers: Vec<NodeId> = Vec::new();
+                    for sk_id in safekeepers_str.split(',').map(str::trim) {
+                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
+                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
+                        })?);
+                        safekeepers.push(sk_id);
+                    }
+                    safekeepers
+                } else {
+                    env.safekeepers.iter().map(|sk| sk.id).collect()
+                };
+
            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token)?;
+                endpoint.start(&auth_token, safekeepers)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");

                let ep = cplane.new_endpoint(
-                    tenant_id,
                    endpoint_id,
+                    tenant_id,
                    timeline_id,
-                    port,
+                    pg_port,
+                    http_port,
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token)?;
+                ep.start(&auth_token, safekeepers)?;
            }
        }
        "stop" => {
@@ -944,11 +970,22 @@ fn cli() -> Command {
        .value_parser(value_parser!(u32))
        .default_value(DEFAULT_PG_VERSION);

-    let port_arg = Arg::new("port")
-        .long("port")
+    let pg_port_arg = Arg::new("pg-port")
+        .long("pg-port")
        .required(false)
        .value_parser(value_parser!(u16))
-        .value_name("port");
+        .value_name("pg-port");
+
+    let http_port_arg = Arg::new("http-port")
+        .long("http-port")
+        .required(false)
+        .value_parser(value_parser!(u16))
+        .value_name("http-port");
+
+    let safekeepers_arg = Arg::new("safekeepers")
+        .long("safekeepers")
+        .required(false)
+        .value_name("safekeepers");

    let stop_mode_arg = Arg::new("stop-mode")
        .short('m')
@@ -1093,7 +1130,8 @@ fn cli() -> Command {
                    .arg(branch_name_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(lsn_arg.clone())
-                    .arg(port_arg.clone())
+                    .arg(pg_port_arg.clone())
+                    .arg(http_port_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1109,9 +1147,11 @@ fn cli() -> Command {
                    .arg(branch_name_arg)
                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
-                    .arg(port_arg)
+                    .arg(pg_port_arg)
+                    .arg(http_port_arg)
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
+                    .arg(safekeepers_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,3 +1,9 @@
+//! Code to manage the storage broker
+//!
+//! In the local test environment, the data for each safekeeper is stored in
+//!
+//!   .neon/safekeepers/<safekeeper id>
+//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -1,41 +1,73 @@
+//! Code to manage compute endpoints
+//!
+//! In the local test environment, the data for each endpoint is stored in
+//!
+//!   .neon/endpoints/<endpoint id>
+//!
+//! Some basic information about the endpoint, like the tenant and timeline IDs,
+//! is stored in the `endpoint.json` file. The `endpoint.json` file is created
+//! when the endpoint is created, and doesn't change afterwards.
+//!
+//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
+//! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
+//! the basebackup from the pageserver to initialize the data directory, and
+//! finally launches the PostgreSQL process. It watches the PostgreSQL process
+//! until it exits.
+//!
+//! When an endpoint is created, a `postgresql.conf` file is also created in
+//! the endpoint's directory. The file can be modified before starting PostgreSQL.
+//! However, the `postgresql.conf` file in the endpoint directory is not used directly
+//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
+//! copy of it in the data directory.
+//!
+//! Directory contents:
+//!
+//! ```ignore
+//! .neon/endpoints/main/
+//!     compute.log               - log output of `compute_ctl` and `postgres`
+//!     endpoint.json             - serialized `EndpointConf` struct
+//!     postgresql.conf           - postgresql settings
+//!     spec.json                 - passed to `compute_ctl`
+//!     pgdata/
+//!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
+//!         zenith.signal
+//!         <other PostgreSQL files>
+//! ```
+//!
 use std::collections::BTreeMap;
-use std::fs::{self, File};
-use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
-use std::os::unix::fs::PermissionsExt;
 use std::path::PathBuf;
-use std::process::{Command, Stdio};
-use std::str::FromStr;
+use std::process::Command;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

-use compute_api::spec::ComputeMode;
+use compute_api::responses::{ComputeState, ComputeStatus};
+use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
 #[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
-    name: String,
+    endpoint_id: String,
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
-    port: u16,
+    pg_port: u16,
+    http_port: u16,
    pg_version: u32,
+    skip_pg_catalog_updates: bool,
 }

 //
@@ -57,11 +89,11 @@ impl ComputeControlPlane {
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut endpoints = BTreeMap::default();
-        for endpoint_dir in fs::read_dir(env.endpoints_path())
+        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
-            endpoints.insert(ep.name.clone(), Arc::new(ep));
+            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

        Ok(ComputeControlPlane {
@@ -76,47 +108,58 @@ impl ComputeControlPlane {
        1 + self
            .endpoints
            .values()
-            .map(|ep| ep.address.port())
+            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
            .max()
            .unwrap_or(self.base_port)
    }

+    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
+        endpoint_id: &str,
        tenant_id: TenantId,
-        name: &str,
        timeline_id: TimelineId,
-        port: Option<u16>,
+        pg_port: Option<u16>,
+        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
-        let port = port.unwrap_or_else(|| self.get_port());
-
+        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
+        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let ep = Arc::new(Endpoint {
-            name: name.to_owned(),
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+            endpoint_id: endpoint_id.to_owned(),
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
+            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
            mode,
            tenant_id,
            pg_version,
+            skip_pg_catalog_updates: false,
        });
-        ep.create_pgdata()?;
+
+        ep.create_endpoint_dir()?;
        std::fs::write(
            ep.endpoint_path().join("endpoint.json"),
            serde_json::to_string_pretty(&EndpointConf {
-                name: name.to_string(),
+                endpoint_id: endpoint_id.to_string(),
                tenant_id,
                timeline_id,
                mode,
-                port,
+                http_port,
+                pg_port,
                pg_version,
+                skip_pg_catalog_updates: false,
            })?,
        )?;
-        ep.setup_pg_conf()?;
+        std::fs::write(
+            ep.endpoint_path().join("postgresql.conf"),
+            ep.setup_pg_conf()?.to_string(),
+        )?;

-        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
+        self.endpoints
+            .insert(ep.endpoint_id.clone(), Arc::clone(&ep));

        Ok(ep)
    }
@@ -127,13 +170,15 @@ impl ComputeControlPlane {
 #[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
-    name: String,
+    endpoint_id: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,

-    // port and address of the Postgres server
-    pub address: SocketAddr,
+    // port and address of the Postgres server and `compute_ctl`'s HTTP API
+    pub pg_address: SocketAddr,
+    pub http_address: SocketAddr,
+
    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,

@@ -141,6 +186,9 @@ pub struct Endpoint {
    // the endpoint runs in.
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
+
+    // Optimizations
+    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
@@ -158,123 +206,37 @@ impl Endpoint {

        // parse data directory name
        let fname = entry.file_name();
-        let name = fname.to_str().unwrap().to_string();
+        let endpoint_id = fname.to_str().unwrap().to_string();

        // Read the endpoint.json file
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        // ok now
        Ok(Endpoint {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
-            name,
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
+            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
+            endpoint_id,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
+            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
        })
    }

-    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
-        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
-        let mut cmd = Command::new(pg_path);
-
-        cmd.arg("--sync-safekeepers")
-            .env_clear()
-            .env(
-                "LD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
-            )
-            .env(
-                "DYLD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
-            )
-            .env("PGDATA", self.pgdata().to_str().unwrap())
-            .stdout(Stdio::piped())
-            // Comment this to avoid capturing stderr (useful if command hangs)
-            .stderr(Stdio::piped());
-
-        if let Some(token) = auth_token {
-            cmd.env("NEON_AUTH_TOKEN", token);
-        }
-
-        let sync_handle = cmd
-            .spawn()
-            .expect("postgres --sync-safekeepers failed to start");
-
-        let sync_output = sync_handle
-            .wait_with_output()
-            .expect("postgres --sync-safekeepers failed");
-        if !sync_output.status.success() {
-            anyhow::bail!(
-                "sync-safekeepers failed: '{}'",
-                String::from_utf8_lossy(&sync_output.stderr)
-            );
-        }
-
-        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
-        println!("Safekeepers synced on {}", lsn);
-        Ok(lsn)
-    }
-
-    /// Get basebackup from the pageserver as a tar archive and extract it
-    /// to the `self.pgdata()` directory.
-    fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
-        println!(
-            "Extracting base backup to create postgres instance: path={} port={}",
-            self.pgdata().display(),
-            self.address.port()
-        );
-
-        let sql = if let Some(lsn) = lsn {
-            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
-        } else {
-            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
-        };
-
-        let mut client = self
-            .pageserver
-            .page_server_psql_client()
-            .context("connecting to page server failed")?;
-
-        let copyreader = client
-            .copy_out(sql.as_str())
-            .context("page server 'basebackup' command failed")?;
-
-        // Read the archive directly from the `CopyOutReader`
-        //
-        // Set `ignore_zeros` so that unpack() reads all the Copy data and
-        // doesn't stop at the end-of-archive marker. Otherwise, if the server
-        // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
-        ar.set_ignore_zeros(true);
-        ar.unpack(&self.pgdata())
-            .context("extracting base backup failed")?;
-
-        Ok(())
-    }
-
-    fn create_pgdata(&self) -> Result<()> {
-        fs::create_dir_all(self.pgdata()).with_context(|| {
+    fn create_endpoint_dir(&self) -> Result<()> {
+        std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
            format!(
-                "could not create data directory {}",
-                self.pgdata().display()
+                "could not create endpoint directory {}",
+                self.endpoint_path().display()
            )
-        })?;
-        fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
-            .with_context(|| {
-                format!(
-                    "could not set permissions in data directory {}",
-                    self.pgdata().display()
-                )
-            })
+        })
    }

-    // Write postgresql.conf with default configuration
-    // and PG_VERSION file to the data directory of a new endpoint.
-    fn setup_pg_conf(&self) -> Result<()> {
+    // Generate postgresql.conf with default configuration
+    fn setup_pg_conf(&self) -> Result<PostgresConf> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
@@ -287,25 +249,14 @@ impl Endpoint {
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
-        conf.append("listen_addresses", &self.address.ip().to_string());
-        conf.append("port", &self.address.port().to_string());
+        conf.append("listen_addresses", &self.pg_address.ip().to_string());
+        conf.append("port", &self.pg_address.port().to_string());
        conf.append("wal_keep_size", "0");
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");

-        // Configure the Neon Postgres extension to fetch pages from pageserver
-        let pageserver_connstr = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
-
-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        };
+        // Load the 'neon' extension
        conf.append("shared_preload_libraries", "neon");
-        conf.append_line("");
-        conf.append("neon.pageserver_connstring", &pageserver_connstr);
-        conf.append("neon.tenant_id", &self.tenant_id.to_string());
-        conf.append("neon.timeline_id", &self.timeline_id.to_string());

        conf.append_line("");
        // Replication-related configurations, such as WAL sending
@@ -390,46 +341,11 @@ impl Endpoint {
            }
        }

-        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
-        file.write_all(conf.to_string().as_bytes())?;
-
-        let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
-        file.write_all(self.pg_version.to_string().as_bytes())?;
-
-        Ok(())
-    }
-
-    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = match &self.mode {
-            ComputeMode::Primary => {
-                if !self.env.safekeepers.is_empty() {
-                    // LSN 0 means that it is bootstrap and we need to download just
-                    // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-                    // procedure evolves quite actively right now, so let's think about it again
-                    // when things would be more stable (TODO).
-                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-                    if lsn == Lsn(0) {
-                        None
-                    } else {
-                        Some(lsn)
-                    }
-                } else {
-                    None
-                }
-            }
-            ComputeMode::Static(lsn) => Some(*lsn),
-            ComputeMode::Replica => {
-                None // Take the latest snapshot available to start with
-            }
-        };
-
-        self.do_basebackup(backup_lsn)?;
-
-        Ok(())
+        Ok(conf)
    }

    pub fn endpoint_path(&self) -> PathBuf {
-        self.env.endpoints_path().join(&self.name)
+        self.env.endpoints_path().join(&self.endpoint_id)
    }

    pub fn pgdata(&self) -> PathBuf {
@@ -439,7 +355,7 @@ impl Endpoint {
    pub fn status(&self) -> &str {
        let timeout = Duration::from_millis(300);
        let has_pidfile = self.pgdata().join("postmaster.pid").exists();
-        let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
+        let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();

        match (has_pidfile, can_connect) {
            (true, true) => "running",
@@ -457,8 +373,6 @@ impl Endpoint {
                &[
                    "-D",
                    self.pgdata().to_str().unwrap(),
-                    "-l",
-                    self.pgdata().join("pg.log").to_str().unwrap(),
                    "-w", //wait till pg_ctl actually does what was asked
                ],
                args,
@@ -494,36 +408,184 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
         if self.status() == "running" {
             anyhow::bail!("The endpoint is already running");
         }
 
-        // 1. We always start Postgres from scratch, so
-        // if old dir exists, preserve 'postgresql.conf' and drop the directory
-        let postgresql_conf_path = self.pgdata().join("postgresql.conf");
-        let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
-            format!(
-                "failed to read config file in {}",
-                postgresql_conf_path.to_str().unwrap()
-            )
-        })?;
-        fs::remove_dir_all(self.pgdata())?;
-        self.create_pgdata()?;
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => String::from_utf8(content)?,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
+            Err(e) => {
+                return Err(anyhow::Error::new(e).context(format!(
+                    "failed to read config file in {}",
+                    postgresql_conf_path.to_str().unwrap()
+                )))
+            }
+        };

-        // 2. Bring back config files
-        fs::write(&postgresql_conf_path, postgresql_conf)?;
-
-        // 3. Load basebackup
-        self.load_basebackup(auth_token)?;
-
-        if self.mode != ComputeMode::Primary {
-            File::create(self.pgdata().join("standby.signal"))?;
+        // We always start the compute node from scratch, so if the Postgres
+        // data dir exists from a previous launch, remove it first.
+        if self.pgdata().exists() {
+            std::fs::remove_dir_all(self.pgdata())?;
         }

-        // 4. Finally start postgres
-        println!("Starting postgres at '{}'", self.connstr());
-        self.pg_ctl(&["start"], auth_token)
+        let pageserver_connstring = {
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());
+
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
+        };
+        let mut safekeeper_connstrings = Vec::new();
+        if self.mode == ComputeMode::Primary {
+            for sk_id in safekeepers {
+                let sk = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .find(|node| node.id == sk_id)
+                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
+            }
+        }
+
+        // Create spec file
+        let spec = ComputeSpec {
+            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
+            format_version: 1.0,
+            operation_uuid: None,
+            cluster: Cluster {
+                cluster_id: None, // project ID: not used
+                name: None,       // project name: not used
+                state: None,
+                roles: vec![],
+                databases: vec![],
+                settings: None,
+                postgresql_conf: Some(postgresql_conf),
+            },
+            delta_operations: None,
+            tenant_id: Some(self.tenant_id),
+            timeline_id: Some(self.timeline_id),
+            mode: self.mode,
+            pageserver_connstring: Some(pageserver_connstring),
+            safekeeper_connstrings,
+            storage_auth_token: auth_token.clone(),
+        };
+        let spec_path = self.endpoint_path().join("spec.json");
+        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+
+        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
+        let logfile = std::fs::OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(self.endpoint_path().join("compute.log"))?;
+
+        // Launch compute_ctl
+        println!("Starting postgres node at '{}'", self.connstr());
+        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+        cmd.args(["--http-port", &self.http_address.port().to_string()])
+            .args(["--pgdata", self.pgdata().to_str().unwrap()])
+            .args(["--connstr", &self.connstr()])
+            .args([
+                "--spec-path",
+                self.endpoint_path().join("spec.json").to_str().unwrap(),
+            ])
+            .args([
+                "--pgbin",
+                self.env
+                    .pg_bin_dir(self.pg_version)?
+                    .join("postgres")
+                    .to_str()
+                    .unwrap(),
+            ])
+            .stdin(std::process::Stdio::null())
+            .stderr(logfile.try_clone()?)
+            .stdout(logfile);
+        let _child = cmd.spawn()?;
+
+        // Wait for it to start
+        let mut attempt = 0;
+        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
+        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
+        loop {
+            attempt += 1;
+            match self.get_status() {
+                Ok(state) => {
+                    match state.status {
+                        ComputeStatus::Init => {
+                            if attempt == MAX_ATTEMPTS {
+                                bail!("compute startup timed out; still in Init state");
+                            }
+                            // keep retrying
+                        }
+                        ComputeStatus::Running => {
+                            // All good!
+                            break;
+                        }
+                        ComputeStatus::Failed => {
+                            bail!(
+                                "compute startup failed: {}",
+                                state
+                                    .error
+                                    .as_deref()
+                                    .unwrap_or("<no error from compute_ctl>")
+                            );
+                        }
+                        ComputeStatus::Empty
+                        | ComputeStatus::ConfigurationPending
+                        | ComputeStatus::Configuration => {
+                            bail!("unexpected compute status: {:?}", state.status)
+                        }
+                    }
+                }
+                Err(e) => {
+                    if attempt == MAX_ATTEMPTS {
+                        // `e` stays in the error chain as the source, so it
+                        // is not interpolated into the context message.
+                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                    }
+                }
+            }
+            std::thread::sleep(ATTEMPT_INTERVAL);
+        }
+
+        Ok(())
+    }
+
+    // Call the /status HTTP API
+    pub fn get_status(&self) -> Result<ComputeState> {
+        let client = reqwest::blocking::Client::new();
+
+        let response = client
+            .request(
+                reqwest::Method::GET,
+                format!(
+                    "http://{}:{}/status",
+                    self.http_address.ip(),
+                    self.http_address.port()
+                ),
+            )
+            .send()?;
+
+        // Interpret the response
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(response.json()?)
+        } else {
+            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
    }

    pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -540,7 +602,7 @@ impl Endpoint {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            fs::remove_dir_all(self.endpoint_path())?;
+            std::fs::remove_dir_all(self.endpoint_path())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
@@ -549,10 +611,10 @@ impl Endpoint {

    pub fn connstr(&self) -> String {
        format!(
-            "host={} port={} user={} dbname={}",
-            self.address.ip(),
-            self.address.port(),
+            "postgresql://{}@{}:{}/{}",
            "cloud_admin",
+            self.pg_address.ip(),
+            self.pg_address.port(),
            "postgres"
        )
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -24,7 +24,7 @@ use utils::{

 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 //
 // This data structures represents neon_local CLI config
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
-    // compute nodes).
+    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,3 +1,9 @@
+//! Code to manage pageservers
+//!
+//! In the local test environment, the pageserver stores its data directly in
+//!
+//!   .neon/
+//!
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fs::File;
@@ -369,7 +375,16 @@ impl PageServerNode {
            evictions_low_residence_duration_metric_threshold: settings
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
+            gc_feedback: settings
+                .remove("gc_feedback")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_feedback' as bool")?,
        };
+
+        // If tenant ID was not specified, generate one
+        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
+
        let request = models::TenantCreateRequest {
            new_tenant_id,
            config,
@@ -459,6 +474,11 @@ impl PageServerNode {
                evictions_low_residence_duration_metric_threshold: settings
                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
+                gc_feedback: settings
+                    .remove("gc_feedback")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'gc_feedback' as bool")?,
            }
        };

@@ -495,6 +515,9 @@ impl PageServerNode {
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
    ) -> anyhow::Result<TimelineInfo> {
+        // If timeline ID was not specified, generate one
+        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
+
        self.http_request(
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,3 +1,9 @@
+//! Code to manage safekeepers
+//!
+//! In the local test environment, the data for each safekeeper is stored in
+//!
+//!   .neon/safekeepers/<safekeeper id>
+//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 set -eux

+# Generate a random tenant or timeline ID
+#
+# Takes a variable name as argument. The result is stored in that variable.
+generate_id() {
+    local -n resvar=$1
+    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+}
+
 PG_VERSION=${PG_VERSION:-14}

 SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -13,29 +21,29 @@ done
 echo "Page server is ready."

 echo "Create a tenant and timeline"
+generate_id tenant_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{}"
+     -d "{\"new_tenant_id\": \"${tenant_id}\"}"
     http://pageserver:9898/v1/tenant/
 )
-tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
+result=$(curl "${PARAMS[@]}")
+echo $result | jq .

+generate_id timeline_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
+     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
     "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
 )
 result=$(curl "${PARAMS[@]}")
 echo $result | jq .

 echo "Overwrite tenant id and timeline id in spec file"
-tenant_id=$(echo ${result} | jq -r .tenant_id)
-timeline_id=$(echo ${result} | jq -r .timeline_id)
-
 sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
 sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations
 by spawning a separate task. The code that handles incoming HTTP
 requests, for example, spawns a separate task for each request,
 because Hyper will drop the request-handling Future if the HTTP
-connection is lost.  (FIXME: our HTTP handlers do not do that
-currently, but we should fix that. See [issue
-3478](https://github.com/neondatabase/neon/issues/3478)).
+connection is lost.


 #### How to cancel, then?
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};

 use crate::spec::ComputeSpec;

-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
    pub error: String,
 }

 /// Response of the /status API
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
    pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

-#[derive(Serialize)]
+#[derive(Deserialize, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeState {
    pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
    pub error: Option<String>,
 }

-#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,6 +5,7 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 /// String type alias representing Postgres identifier and
@@ -14,7 +15,7 @@ pub type PgIdent = String;
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[serde_as]
-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,

@@ -26,9 +27,38 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

+    /// An optional hint that can be passed to speed up startup time if we know
+    /// that no pg catalog mutations (like role creation, database creation,
+    /// extension creation) need to be done on the actual database to start.
+    #[serde(default)] // Default false
+    pub skip_pg_catalog_updates: bool,
+
+    // Information needed to connect to the storage layer.
+    //
+    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
+    //
+    // Depending on `mode`, this can be a primary read-write node, a read-only
+    // replica, or a read-only node pinned at an older LSN.
+    // `safekeeper_connstrings` must be set for a primary.
+    //
+    // For backwards compatibility, the control plane may leave out all of
+    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
+    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
+    // updated to fill these fields, we can make these non optional.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub tenant_id: Option<TenantId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub pageserver_connstring: Option<String>,
+    #[serde(default)]
+    pub safekeeper_connstrings: Vec<String>,
+
    #[serde(default)]
    pub mode: ComputeMode,

+    /// If set, 'storage_auth_token' is used as the password to authenticate to
+    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
 }

@@ -47,13 +77,19 @@ pub enum ComputeMode {
    Replica,
 }

-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct Cluster {
-    pub cluster_id: String,
-    pub name: String,
+    pub cluster_id: Option<String>,
+    pub name: Option<String>,
    pub state: Option<String>,
    pub roles: Vec<Role>,
    pub databases: Vec<Database>,
+
+    /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
+    /// tool may add additional settings to the final file.)
+    pub postgresql_conf: Option<String>,
+
+    /// Additional settings that will be appended to the 'postgresql.conf' file.
    pub settings: GenericOptions,
 }

@@ -63,7 +99,7 @@ pub struct Cluster {
 /// - DROP ROLE
 /// - ALTER ROLE name RENAME TO new_name
 /// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct DeltaOp {
    pub action: String,
    pub name: PgIdent,
@@ -72,7 +108,7 @@ pub struct DeltaOp {

 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
@@ -81,7 +117,7 @@ pub struct Role {

 /// Rust representation of Postgres database info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
@@ -91,7 +127,7 @@ pub struct Database {
 /// Common type representing both SQL statement params with or without value,
 /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
 /// options like `wal_level = logical`.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
@@ -112,4 +148,14 @@ mod tests {
        let file = File::open("tests/cluster_spec.json").unwrap();
        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_fields() {
+        // Forward compatibility test
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+        ob.insert("unknown_field_123123123".into(), "hello".into());
+        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
+    }
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
+pub mod metric_vec_duration;

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -0,0 +1,23 @@
+//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
+
+use std::{future::Future, time::Instant};
+
+/// Receives the outcome of an operation together with the time it took.
+pub trait DurationResultObserver {
+    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
+}
+
+/// Awaits `block`, reports its result and elapsed time to `observer`, then returns the result.
+pub async fn observe_async_block_duration_by_result<T, E, F, O>(
+    observer: &O,
+    block: F,
+) -> Result<T, E>
+where
+    F: Future<Output = Result<T, E>>,
+    O: DurationResultObserver,
+{
+    let started_at = Instant::now();
+    let outcome = block.await;
+    observer.observe_result(&outcome, started_at.elapsed());
+    outcome
+}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,29 @@ use crate::reltag::RelTag;
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

-/// A state of a tenant in pageserver's memory.
+/// The state of a tenant in this pageserver.
+///
+/// ```mermaid
+/// stateDiagram-v2
+///
+///     [*] --> Loading: spawn_load()
+///     [*] --> Attaching: spawn_attach()
+///
+///     Loading --> Activating: activate()
+///     Attaching --> Activating: activate()
+///     Activating --> Active: infallible
+///
+///     Loading --> Broken: load() failure
+///     Attaching --> Broken: attach() failure
+///
+///     Active --> Stopping: set_stopping(), part of shutdown & detach
+///     Stopping --> Broken: late error in remove_tenant_from_memory
+///
+///     Broken --> [*]: ignore / detach / shutdown
+///     Stopping --> [*]: remove_from_memory complete
+///
+///     Active --> Broken: cfg(testing)-only tenant break point
+/// ```
 #[derive(
    Clone,
    PartialEq,
@@ -26,51 +48,73 @@ use bytes::{BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::EnumString,
    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
 #[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    /// This tenant is being loaded from local disk
+    /// This tenant is being loaded from local disk.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Loading,
-    /// This tenant is being downloaded from cloud storage.
+    /// This tenant is being attached to the pageserver.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
-    /// Tenant is fully operational
+    /// The tenant is transitioning from Loading/Attaching to Active.
+    ///
+    /// While in this state, the individual timelines are being activated.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
+    Activating(ActivatingFrom),
+    /// The tenant has finished activating and is open for business.
+    ///
+    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
-    /// A tenant is recognized by pageserver, but it is being detached or the
+    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
+    ///
+    /// Transitions out of this state are possible through `set_broken()`.
    Stopping,
-    /// A tenant is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The tenant is recognized by the pageserver, but can no longer be used for
+    /// any operations.
+    ///
+    /// If the tenant fails to load or attach, it will transition to this state
+    /// and it is guaranteed that no background tasks are running in its name.
+    ///
+    /// The other way to transition into this state is from `Stopping` state
+    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
+    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    Broken { reason: String, backtrace: String },
 }

 impl TenantState {
    pub fn attachment_status(&self) -> TenantAttachmentStatus {
        use TenantAttachmentStatus::*;
+
+        // Below, TenantState::Activating is treated as a "transient" or "transparent" state:
+        // its attachment_status is that of the state it was entered from.
        match self {
            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
            // So, technically, we can return Attached here.
            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
            // But, our attach task might still be fetching the remote timelines, etc.
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
-            Self::Attaching => Maybe,
+            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
-            Self::Loading => Attached,
+            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
            Self::Active => Attached,
            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
            // However, it also becomes Broken if the regular load fails.
-            // We would need a separate TenantState variant to distinguish these cases.
-            // However, there's no practical difference from Console's perspective.
-            // It will run a Postgres-level health check as soon as it observes Attached.
-            // That will fail on Broken tenants.
-            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
-            Self::Broken { .. } => Attached,
+            // From Console's perspective there's no practical difference
+            // because attachment_status is polled by console only during attach operation execution.
+            Self::Broken { reason, .. } => Failed {
+                reason: reason.to_owned(),
+            },
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
@@ -98,8 +142,17 @@ impl std::fmt::Debug for TenantState {
    }
 }

+/// The only [`TenantState`] variants from which `TenantState::Activating` can be entered.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum ActivatingFrom {
+    /// Arrived at [`TenantState::Activating`] from [`TenantState::Loading`]
+    Loading,
+    /// Arrived at [`TenantState::Activating`] from [`TenantState::Attaching`]
+    Attaching,
+}
+
 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -112,15 +165,14 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken,
+    Broken { reason: String, backtrace: String },
 }

 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_timeline_id: TimelineId,
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
@@ -131,12 +183,11 @@ pub struct TimelineCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize, Debug, Default)]
+#[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_tenant_id: Option<TenantId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -171,6 +222,7 @@ pub struct TenantConfig {
    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
+    pub gc_feedback: Option<bool>,
 }

 #[serde_as]
@@ -184,10 +236,10 @@ pub struct StatusResponse {
 }

 impl TenantCreateRequest {
-    pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
+    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
        TenantCreateRequest {
            new_tenant_id,
-            ..Default::default()
+            config: TenantConfig::default(),
        }
    }
 }
@@ -229,17 +281,41 @@ impl TenantConfigRequest {
            eviction_policy: None,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: None,
+            gc_feedback: None,
        };
        TenantConfigRequest { tenant_id, config }
    }
 }

+#[derive(Debug, Serialize, Deserialize)]
+pub struct TenantAttachRequest {
+    pub config: TenantAttachConfig,
+}
+
+/// Newtype to enforce deny_unknown_fields on TenantConfig for
+/// its usage inside `TenantAttachRequest`.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct TenantAttachConfig {
+    #[serde(flatten)]
+    allowing_unknown_fields: TenantConfig,
+}
+
+impl std::ops::Deref for TenantAttachConfig {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.allowing_unknown_fields
+    }
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
-#[serde(rename_all = "snake_case")]
+#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
 pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
+    Failed { reason: String },
 }

 #[serde_as]
@@ -282,6 +358,7 @@ pub struct TimelineInfo {
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_logical_size_non_incremental: Option<u64>,

    pub timeline_dir_layer_file_size_sum: Option<u64>,

@@ -732,7 +809,9 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
+                "slug":"attached",
+            }
        });

        let original_broken = TenantInfo {
@@ -754,7 +833,9 @@ mod tests {
                }
            },
            "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
+                "slug":"attached",
+            }
        });

        assert_eq!(
@@ -795,5 +876,68 @@ mod tests {
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
+
+        let attach_request = json!({
+            "config": {
+                "unknown_field": "unknown_value".to_string(),
+            },
+        });
+        let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+    }
+
+    #[test]
+    fn tenantstatus_activating_serde() {
+        let states = [
+            TenantState::Activating(ActivatingFrom::Loading),
+            TenantState::Activating(ActivatingFrom::Attaching),
+        ];
+        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
+
+        let actual = serde_json::to_string(&states).unwrap();
+
+        assert_eq!(actual, expected);
+
+        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
+
+        assert_eq!(states.as_slice(), &parsed);
+    }
+
+    #[test]
+    fn tenantstatus_activating_strum() {
+        // tests added, because we use these for metrics
+        let examples = [
+            (line!(), TenantState::Loading, "Loading"),
+            (line!(), TenantState::Attaching, "Attaching"),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Loading),
+                "Activating",
+            ),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Attaching),
+                "Activating",
+            ),
+            (line!(), TenantState::Active, "Active"),
+            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Broken {
+                    reason: "Example".into(),
+                    backtrace: "Looooong backtrace".into(),
+                },
+                "Broken",
+            ),
+        ];
+
+        for (line, rendered, expected) in examples {
+            let actual: &'static str = rendered.into();
+            assert_eq!(actual, expected, "example on {line}");
+        }
    }
 }
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -24,7 +24,6 @@ workspace_hack.workspace = true
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
-wal_craft = { path = "wal_craft" }

 [build-dependencies]
 anyhow.workspace = true
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -33,6 +33,7 @@ macro_rules! postgres_ffi {
            }
            pub mod controlfile_utils;
            pub mod nonrelfile_utils;
+            pub mod wal_craft_test_export;
            pub mod waldecoder_handler;
            pub mod xlog_utils;

@@ -45,8 +46,15 @@ macro_rules! postgres_ffi {
    };
 }

-postgres_ffi!(v14);
-postgres_ffi!(v15);
+#[macro_export]
+macro_rules! for_all_postgres_versions {
+    ($macro:tt) => {
+        $macro!(v14);
+        $macro!(v15);
+    };
+}
+
+for_all_postgres_versions! { postgres_ffi }

 pub mod pg_constants;
 pub mod relfile_utils;
--- a/libs/postgres_ffi/src/wal_craft_test_export.rs
+++ b/libs/postgres_ffi/src/wal_craft_test_export.rs
@@ -0,0 +1,6 @@
+//! This module exists so that the wal_craft crate can test postgres_ffi. It should not be imported in normal usage.
+
+pub use super::PG_MAJORVERSION;
+pub use super::xlog_utils::*;
+pub use super::bindings::*;
+pub use crate::WAL_SEGMENT_SIZE;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-#[cfg(test)]
-mod tests {
-    use super::super::PG_MAJORVERSION;
-    use super::*;
-    use regex::Regex;
-    use std::cmp::min;
-    use std::fs;
-    use std::{env, str::FromStr};
-    use utils::const_assert;
-
-    fn init_logging() {
-        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
-            format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
-        ))
-        .is_test(true)
-        .try_init();
-    }
-
-    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
-        use wal_craft::*;
-
-        let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
-
-        // Craft some WAL
-        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-            .join("..")
-            .join("..");
-        let cfg = Conf {
-            pg_version,
-            pg_distrib_dir: top_path.join("pg_install"),
-            datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
-        };
-        if cfg.datadir.exists() {
-            fs::remove_dir_all(&cfg.datadir).unwrap();
-        }
-        cfg.initdb().unwrap();
-        let srv = cfg.start_server().unwrap();
-        let (intermediate_lsns, expected_end_of_wal_partial) =
-            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
-        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
-            .iter()
-            .map(|&lsn| u64::from(lsn).into())
-            .collect();
-        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
-        srv.kill();
-
-        // Check find_end_of_wal on the initial WAL
-        let last_segment = cfg
-            .wal_dir()
-            .read_dir()
-            .unwrap()
-            .map(|f| f.unwrap().file_name().into_string().unwrap())
-            .filter(|fname| IsXLogFileName(fname))
-            .max()
-            .unwrap();
-        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
-        for start_lsn in intermediate_lsns
-            .iter()
-            .chain(std::iter::once(&expected_end_of_wal))
-        {
-            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
-            // We assume that `start_lsn` is non-decreasing.
-            info!(
-                "Checking with start_lsn={}, erasing WAL before it",
-                start_lsn
-            );
-            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-                let fname = file.file_name().into_string().unwrap();
-                if !IsXLogFileName(&fname) {
-                    continue;
-                }
-                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
-                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
-                if seg_start_lsn > u64::from(*start_lsn) {
-                    continue;
-                }
-                let mut f = File::options().write(true).open(file.path()).unwrap();
-                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
-                f.write_all(
-                    &ZEROS[0..min(
-                        WAL_SEGMENT_SIZE,
-                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
-                    )],
-                )
-                .unwrap();
-            }
-            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
-        }
-    }
-
-    fn check_pg_waldump_end_of_wal(
-        cfg: &wal_craft::Conf,
-        last_segment: &str,
-        expected_end_of_wal: Lsn,
-    ) {
-        // Get the actual end of WAL by pg_waldump
-        let waldump_output = cfg
-            .pg_waldump("000000010000000000000001", last_segment)
-            .unwrap()
-            .stderr;
-        let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
-        let caps = match Regex::new(r"invalid record length at (.+):")
-            .unwrap()
-            .captures(waldump_output)
-        {
-            Some(caps) => caps,
-            None => {
-                error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
-                panic!();
-            }
-        };
-        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-        info!(
-            "waldump erred on {}, expected wal end at {}",
-            waldump_wal_end, expected_end_of_wal
-        );
-        assert_eq!(waldump_wal_end, expected_end_of_wal);
-    }
-
-    fn check_end_of_wal(
-        cfg: &wal_craft::Conf,
-        last_segment: &str,
-        start_lsn: Lsn,
-        expected_end_of_wal: Lsn,
-    ) {
-        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-        // info!(
-        //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
-        //     wal_end
-        // );
-        // assert_eq!(wal_end, expected_end_of_wal_non_partial);
-
-        // Rename file to partial to actually find last valid lsn, then rename it back.
-        fs::rename(
-            cfg.wal_dir().join(last_segment),
-            cfg.wal_dir().join(format!("{}.partial", last_segment)),
-        )
-        .unwrap();
-        let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-        info!(
-            "find_end_of_wal returned wal_end={} with partial WAL segment",
-            wal_end
-        );
-        assert_eq!(wal_end, expected_end_of_wal);
-        fs::rename(
-            cfg.wal_dir().join(format!("{}.partial", last_segment)),
-            cfg.wal_dir().join(last_segment),
-        )
-        .unwrap();
-    }
-
-    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
-
-    #[test]
-    pub fn test_find_end_of_wal_simple() {
-        init_logging();
-        test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
-    }
-
-    #[test]
-    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
-        init_logging();
-        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
-            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-        );
-    }
-
-    #[test]
-    pub fn test_find_end_of_wal_last_crossing_segment() {
-        init_logging();
-        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
-            "test_find_end_of_wal_last_crossing_segment",
-        );
-    }
-
-    /// Check the math in update_next_xid
-    ///
-    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
-    /// currently 1024.
-    #[test]
-    pub fn test_update_next_xid() {
-        let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
-        let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
-
-        checkpoint.nextXid = FullTransactionId { value: 10 };
-        assert_eq!(checkpoint.nextXid.value, 10);
-
-        // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
-        // boundary
-        checkpoint.update_next_xid(100);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-
-        // No change
-        checkpoint.update_next_xid(500);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-        checkpoint.update_next_xid(1023);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-
-        // The function returns the *next* XID, given the highest XID seen so
-        // far. So when we pass 1024, the nextXid gets bumped up to the next
-        // XID_CHECKPOINT_INTERVAL boundary.
-        checkpoint.update_next_xid(1024);
-        assert_eq!(checkpoint.nextXid.value, 2048);
-    }
-
-    #[test]
-    pub fn test_encode_logical_message() {
-        let expected = [
-            64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
-            38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
-            101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
-        ];
-        let actual = encode_logical_message("prefix", "message");
-        assert_eq!(expected, actual[..]);
-    }
-}
+// If you need to craft WAL and write tests for this module, put them in the wal_craft crate.
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -15,3 +15,7 @@ postgres_ffi.workspace = true
 tempfile.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+regex.workspace = true
+utils.workspace = true
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -10,6 +10,20 @@ use std::process::Command;
 use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};

+macro_rules! xlog_utils_test {
+    ($version:ident) => {
+        #[path = "."]
+        mod $version {
+            pub use postgres_ffi::$version::wal_craft_test_export::*;
+            #[allow(clippy::duplicate_mod)]
+            #[cfg(test)]
+            mod xlog_utils_test;
+        }
+    };
+}
+
+postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
    pub pg_version: u32,
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -0,0 +1,219 @@
+//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
+
+use super::*;
+use crate::{error, info};
+use regex::Regex;
+use std::cmp::min;
+use std::fs::{self, File};
+use std::io::Write;
+use std::{env, str::FromStr};
+use utils::const_assert;
+use utils::lsn::Lsn;
+
+fn init_logging() {
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+        format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), // filter by crate name
+    ))
+    .is_test(true)
+    .try_init();
+}
+
+fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
+    use crate::*;
+
+    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
+
+    // Craft some WAL
+    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("..");
+    let cfg = Conf {
+        pg_version,
+        pg_distrib_dir: top_path.join("pg_install"),
+        datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
+    };
+    if cfg.datadir.exists() {
+        fs::remove_dir_all(&cfg.datadir).unwrap();
+    }
+    cfg.initdb().unwrap();
+    let srv = cfg.start_server().unwrap();
+    let (intermediate_lsns, expected_end_of_wal_partial) =
+        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
+        .iter()
+        .map(|&lsn| u64::from(lsn).into())
+        .collect();
+    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
+    srv.kill();
+
+    // Check find_end_of_wal on the initial WAL
+    let last_segment = cfg
+        .wal_dir()
+        .read_dir()
+        .unwrap()
+        .map(|f| f.unwrap().file_name().into_string().unwrap())
+        .filter(|fname| IsXLogFileName(fname))
+        .max()
+        .unwrap();
+    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+    for start_lsn in intermediate_lsns
+        .iter()
+        .chain(std::iter::once(&expected_end_of_wal))
+    {
+        // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
+        // We assume that `start_lsn` is non-decreasing.
+        info!(
+            "Checking with start_lsn={}, erasing WAL before it",
+            start_lsn
+        );
+        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
+            let fname = file.file_name().into_string().unwrap();
+            if !IsXLogFileName(&fname) {
+                continue;
+            }
+            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+            if seg_start_lsn > u64::from(*start_lsn) {
+                continue;
+            }
+            let mut f = File::options().write(true).open(file.path()).unwrap();
+            const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
+            f.write_all(
+                &ZEROS[0..min(
+                    WAL_SEGMENT_SIZE,
+                    (u64::from(*start_lsn) - seg_start_lsn) as usize,
+                )],
+            )
+            .unwrap();
+        }
+        check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
+    }
+}
+
+fn check_pg_waldump_end_of_wal(
+    cfg: &crate::Conf,
+    last_segment: &str,
+    expected_end_of_wal: Lsn,
+) {
+    // Get the actual end of WAL by pg_waldump
+    let waldump_output = cfg
+        .pg_waldump("000000010000000000000001", last_segment)
+        .unwrap()
+        .stderr;
+    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
+    let caps = match Regex::new(r"invalid record length at (.+):")
+        .unwrap()
+        .captures(waldump_output)
+    {
+        Some(caps) => caps,
+        None => {
+            error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
+            panic!();
+        }
+    };
+    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
+    info!(
+        "waldump erred on {}, expected wal end at {}",
+        waldump_wal_end, expected_end_of_wal
+    );
+    assert_eq!(waldump_wal_end, expected_end_of_wal);
+}
+
+fn check_end_of_wal(
+    cfg: &crate::Conf,
+    last_segment: &str,
+    start_lsn: Lsn,
+    expected_end_of_wal: Lsn,
+) {
+    // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
+    // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+    // info!(
+    //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
+    //     wal_end
+    // );
+    // assert_eq!(wal_end, expected_end_of_wal_non_partial);
+
+    // Rename file to partial to actually find last valid lsn, then rename it back.
+    fs::rename(
+        cfg.wal_dir().join(last_segment),
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+    )
+    .unwrap();
+    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+    info!(
+        "find_end_of_wal returned wal_end={} with partial WAL segment",
+        wal_end
+    );
+    assert_eq!(wal_end, expected_end_of_wal);
+    fs::rename(
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir().join(last_segment),
+    )
+    .unwrap();
+}
+
+const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
+
+#[test]
+pub fn test_find_end_of_wal_simple() {
+    init_logging();
+    test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
+}
+
+#[test]
+pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
+    init_logging();
+    test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
+        "test_find_end_of_wal_crossing_segment_followed_by_small_one",
+    );
+}
+
+#[test]
+pub fn test_find_end_of_wal_last_crossing_segment() {
+    init_logging();
+    test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
+        "test_find_end_of_wal_last_crossing_segment",
+    );
+}
+
+/// Check the math in update_next_xid
+///
+/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
+/// currently 1024.
+#[test]
+pub fn test_update_next_xid() {
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
+    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
+
+    checkpoint.nextXid = FullTransactionId { value: 10 };
+    assert_eq!(checkpoint.nextXid.value, 10);
+
+    // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
+    // boundary
+    checkpoint.update_next_xid(100);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+
+    // No change
+    checkpoint.update_next_xid(500);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+    checkpoint.update_next_xid(1023);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+
+    // The function returns the *next* XID, given the highest XID seen so
+    // far. So when we pass 1024, the nextXid gets bumped up to the next
+    // XID_CHECKPOINT_INTERVAL boundary.
+    checkpoint.update_next_xid(1024);
+    assert_eq!(checkpoint.nextXid.value, 2048);
+}
+
+#[test]
+pub fn test_encode_logical_message() {
+    let expected = [
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+    ];
+    let actual = encode_logical_message("prefix", "message");
+    assert_eq!(expected, actual[..]);
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,6 +70,14 @@ impl RemotePath {
    pub fn join(&self, segment: &Path) -> Self {
        Self(self.0.join(segment))
    }
+
+    /// Returns a reference to the wrapped path buffer.
+    pub fn get_path(&self) -> &PathBuf {
+        &self.0
+    }
+
+    /// Returns the extension of the wrapped path as a `&str`.
+    ///
+    /// Yields `None` when the path has no extension or when the extension
+    /// cannot be represented as UTF-8.
+    pub fn extension(&self) -> Option<&str> {
+        self.0.extension().and_then(|ext| ext.to_str())
+    }
 }

 /// Storage (potentially remote) API to manage its state.
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError>;

+    /// Lists all files in directory "recursively"
+    /// (not really recursively, because AWS has a flat namespace)
+    /// Note: This is subtly different from list_prefixes,
+    /// because it is for listing files instead of listing
+    /// names sharing common prefixes.
+    /// For example,
+    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
+    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
+    /// whereas,
+    /// list_prefixes("foo/bar/") = ["cat", "dog"]
+    /// See `test_real_s3.rs` for more details.
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
@@ -111,6 +132,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<Download, DownloadError>;

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+
+    /// Deletes every object named in `paths`.
+    /// Returns an error if any of the deletions fails.
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }

 pub struct Download {
@@ -172,6 +195,14 @@ impl GenericRemoteStorage {
        }
    }

+    /// Forwards to the `list_files` implementation of whichever storage
+    /// backend this value wraps.
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(local) => local.list_files(folder).await,
+            Self::AwsS3(bucket) => bucket.list_files(folder).await,
+            Self::Unreliable(wrapper) => wrapper.list_files(folder).await,
+        }
+    }
+
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -223,6 +254,14 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
+
+    /// Forwards to the `delete_objects` implementation of whichever storage
+    /// backend this value wraps.
+    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(local) => local.delete_objects(paths).await,
+            Self::AwsS3(bucket) => bucket.delete_objects(paths).await,
+            Self::Unreliable(wrapper) => wrapper.delete_objects(paths).await,
+        }
+    }
 }

 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe::path_with_suffix_extension;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

 use crate::{Download, DownloadError, RemotePath};

@@ -48,6 +48,14 @@ impl LocalFs {
        Ok(Self { storage_root })
    }

+    // Turns a local path located under `storage_root` into a `RemotePath`
+    // relative to that root (mirrors S3Bucket::s3_object_to_relative_path).
+    // Panics when `key` does not start with `storage_root`.
+    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
+        RemotePath(
+            key.strip_prefix(&self.storage_root)
+                .expect("relative path must contain storage_root as prefix")
+                .to_path_buf(),
+        )
+    }
+
    async fn read_storage_metadata(
        &self,
        file_path: &Path,
@@ -101,19 +109,63 @@ impl RemoteStorage for LocalFs {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
-        Ok(get_all_files(path.as_ref(), false)
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
            .await
-            .map_err(DownloadError::Other)?
-            .into_iter()
-            .map(|path| {
-                path.strip_prefix(&self.storage_root)
-                    .context("Failed to strip preifix")
+            .map_err(DownloadError::Other)?;
+
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            prefixes.push(
+                prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
                    .and_then(RemotePath::new)
                    .expect(
                        "We list files for storage root, hence should be able to remote the prefix",
-                    )
-            })
-            .collect())
+                    ),
+            )
+        }
+
+        Ok(prefixes)
+    }
+
+    // Walks the directory tree below `folder` (or below the storage root when
+    // `folder` is `None`) and returns every entry found, as paths relative to
+    // the storage root; mirrors the `list_files` for `s3_bucket`.
+    // NOTE(review): directory entries themselves are pushed into the result as
+    // well as regular files — confirm callers expect that.
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let start_dir = match folder {
+            Some(folder) => folder.with_base(&self.storage_root),
+            None => self.storage_root.clone(),
+        };
+        let mut files = Vec::new();
+        // Directories still waiting to be read; popped from the back.
+        let mut pending_dirs = vec![start_dir];
+
+        while let Some(current_dir) = pending_dirs.pop() {
+            let mut entries = fs::read_dir(&current_dir).await?;
+            while let Some(entry) = entries.next_entry().await? {
+                let entry_path = current_dir.join(entry.file_name());
+                files.push(self.local_file_to_relative_path(entry_path.clone()));
+                if entry_path.is_dir() {
+                    pending_dirs.push(entry_path);
+                }
+            }
+        }
+        Ok(files)
    }

    async fn upload(
@@ -291,11 +343,25 @@ impl RemoteStorage for LocalFs {

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if file_path.exists() && file_path.is_file() {
-            Ok(fs::remove_file(file_path).await?)
-        } else {
-            bail!("File {file_path:?} either does not exist or is not a file")
+        if !file_path.exists() {
+            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
+            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
+            return Ok(());
        }
+
+        if !file_path.is_file() {
+            anyhow::bail!("{file_path:?} is not a file");
+        }
+        Ok(fs::remove_file(file_path)
+            .await
+            .map_err(|e| anyhow::anyhow!(e))?)
+    }
+
+    // Deletes the given paths one at a time through `delete`, returning the
+    // first error encountered without attempting the remaining paths.
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        for target in paths.iter() {
+            self.delete(target).await?;
+        }
+        Ok(())
    }
 }

@@ -320,7 +386,7 @@ where
                    let file_type = dir_entry.file_type().await?;
                    let entry_path = dir_entry.path();
                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} us a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +661,11 @@ mod fs_tests {
        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());

-        match storage.delete(&upload_target).await {
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                let expected_path = upload_target.with_base(&storage.storage_root);
-                assert!(error_string.contains(expected_path.to_str().unwrap()));
-            }
-        }
+        storage
+            .delete(&upload_target)
+            .await
+            .expect("Should allow deleting non-existing storage files");
+
        Ok(())
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
+    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_http::body::SdkBody;
@@ -33,6 +34,8 @@ use crate::{
    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

+const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
+
 pub(super) mod metrics {
    use metrics::{register_int_counter_vec, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -81,12 +84,24 @@ pub(super) mod metrics {
            .inc();
    }

+    /// Adds `count` to the request counter under the `delete_object` label.
+    pub fn inc_delete_objects(count: u64) {
+        let counter = S3_REQUESTS_COUNT.with_label_values(&["delete_object"]);
+        counter.inc_by(count);
+    }
+
    pub fn inc_delete_object_fail() {
        S3_REQUESTS_FAIL_COUNT
            .with_label_values(&["delete_object"])
            .inc();
    }

+    /// Adds `count` to the failed-request counter under the `delete_object` label.
+    pub fn inc_delete_objects_fail(count: u64) {
+        let counter = S3_REQUESTS_FAIL_COUNT.with_label_values(&["delete_object"]);
+        counter.inc_by(count);
+    }
+
    pub fn inc_list_objects() {
        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
    }
@@ -332,6 +347,51 @@ impl RemoteStorage for S3Bucket {
        Ok(document_keys)
    }

+    /// See the doc for `RemoteStorage::list_files`
+    ///
+    /// Sends `list_objects_v2` requests until the response carries no further
+    /// continuation token, collecting the key of every returned object.
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        // An explicit folder takes precedence; otherwise fall back to the
+        // configured bucket prefix (which may itself be absent).
+        let list_prefix = match folder {
+            Some(path) => Some(self.relative_path_to_s3_object(path)),
+            None => self.prefix_in_bucket.clone(),
+        };
+
+        let mut all_files = Vec::new();
+        // AWS may need to break the response into several parts
+        let mut next_token = None;
+        loop {
+            // A fresh permit is taken for every request and released at the
+            // end of the iteration.
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
+            metrics::inc_list_objects();
+
+            let request = self
+                .client
+                .list_objects_v2()
+                .bucket(self.bucket_name.clone())
+                .set_prefix(list_prefix.clone())
+                .set_continuation_token(next_token)
+                .set_max_keys(self.max_keys_per_list_response);
+            let response = request
+                .send()
+                .await
+                .map_err(|e| {
+                    metrics::inc_list_objects_fail();
+                    e
+                })
+                .context("Failed to list files in S3 bucket")?;
+
+            all_files.extend(response.contents().unwrap_or_default().iter().map(|object| {
+                let key = object.key().expect("response does not contain a key");
+                self.s3_object_to_relative_path(key)
+            }));
+
+            next_token = response.next_continuation_token;
+            if next_token.is_none() {
+                break;
+            }
+        }
+        Ok(all_files)
+    }
+
    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -396,6 +456,50 @@ impl RemoteStorage for S3Bucket {
        })
        .await
    }
+    /// Deletes `paths` using batched `delete_objects` requests of at most
+    /// `MAX_DELETE_OBJECTS_REQUEST_SIZE` keys each.
+    ///
+    /// Stops at the first batch that fails, either because the request itself
+    /// failed or because the response lists per-object errors.
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        // One permit is acquired up front and held across all batches.
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+
+        let object_ids: Vec<ObjectIdentifier> = paths
+            .iter()
+            .map(|path| {
+                ObjectIdentifier::builder()
+                    .set_key(Some(self.relative_path_to_s3_object(path)))
+                    .build()
+            })
+            .collect();
+
+        for batch in object_ids.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+            metrics::inc_delete_objects(batch.len() as u64);
+
+            let delete_request = Delete::builder().set_objects(Some(batch.to_vec())).build();
+            let outcome = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(delete_request)
+                .send()
+                .await;
+
+            let resp = match outcome {
+                Ok(resp) => resp,
+                Err(e) => {
+                    // The whole request failed: count every key of the batch.
+                    metrics::inc_delete_objects_fail(batch.len() as u64);
+                    return Err(e.into());
+                }
+            };
+            if let Some(errors) = resp.errors {
+                metrics::inc_delete_objects_fail(errors.len() as u64);
+                return Err(anyhow::format_err!(
+                    "Failed to delete {} objects",
+                    errors.len()
+                ));
+            }
+        }
+        Ok(())
+    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let _guard = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -24,6 +24,7 @@ enum RemoteOp {
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
+    DeleteObjects(Vec<RemotePath>),
 }

 impl UnreliableWrapper {
@@ -82,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_prefixes(prefix).await
    }

+    // Runs the simulated-failure check, then forwards to the inner storage.
+    // The attempt is recorded under the `RemoteOp::ListPrefixes` key.
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
+        self.inner.list_files(folder).await
+    }
+
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -119,4 +125,21 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Delete(path.clone()))?;
        self.inner.delete(path).await
    }
+
+    // Deletes every path in `paths` after one simulated-failure check for the
+    // batch as a whole. Returns an error stating how many of the individual
+    // deletions failed.
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
+        let mut error_counter = 0;
+        for path in paths {
+            // Go straight to the inner storage: the attempt for this batch was
+            // already recorded above. Calling `self.delete` here would run a
+            // second, per-path `RemoteOp::Delete` attempt and inject extra
+            // simulated failures on top of the batch-level one.
+            if self.inner.delete(path).await.is_err() {
+                error_counter += 1;
+            }
+        }
+        if error_counter > 0 {
+            return Err(anyhow::anyhow!(
+                "failed to delete {} objects",
+                error_counter
+            ));
+        }
+        Ok(())
+    }
 }
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -7,6 +7,7 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
@@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};

+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

+const BASE_PREFIX: &str = "test/";
+
 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
@@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledS3)]
+#[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
-async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };

-    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
+    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

-    let base_prefix =
-        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
+    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
        .list_prefixes(None)
        .await
@@ -83,27 +88,174 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<(
    Ok(())
 }

+/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `s3_pagination_should_work` for more information.
+///
+/// First, a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` is created in [`upload_simple_s3_data`].
+/// Then the following queries are performed:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
+#[tokio::test]
+async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
+    };
+    let client = Arc::clone(&ctx.enabled.client);
+    let folder = RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
+
+    // Query 1: listing from the root must return exactly the uploaded blobs.
+    let listed_at_root: HashSet<_> = client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect();
+    assert_eq!(
+        listed_at_root,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+
+    // Query 2: listing `folder1` must return exactly the uploads whose path
+    // starts with that folder name.
+    let listed_in_folder: HashSet<_> = client
+        .list_files(Some(&folder))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect();
+    let mut expected_in_folder = HashSet::new();
+    for blob in ctx.remote_blobs.iter() {
+        let blob_name = blob.get_path().to_str().expect("must be valid name");
+        if blob_name.starts_with("folder1") {
+            expected_in_folder
+                .insert(RemotePath::new(Path::new(blob_name)).expect("must be valid name"));
+        }
+    }
+    assert_eq!(
+        listed_in_folder, expected_in_folder,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+/// Checks that `delete` on a key that was never uploaded by this test
+/// returns successfully. Skipped (passes) when real S3 tests are disabled.
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Disabled => return Ok(()),
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+    };
+
+    let missing_key = format!(
+        "{}/for_sure_there_is_nothing_there_really",
+        ctx.base_prefix,
+    );
+    let path =
+        RemotePath::new(&PathBuf::from(missing_key)).context("RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+/// Uploads two small blobs and removes both with a single `delete_objects`
+/// call. Skipped (passes) when real S3 tests are disabled.
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Disabled => return Ok(()),
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+    };
+
+    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
+        .context("RemotePath conversion")?;
+    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
+        .context("RemotePath conversion")?;
+
+    // Upload in the same order as the paths were built: path1, then path2.
+    let uploads = [
+        (&path1, "remote blob data1"),
+        (&path2, "remote blob data2"),
+    ];
+    for (target, payload) in uploads {
+        let bytes = payload.as_bytes();
+        ctx.client
+            .upload(std::io::Cursor::new(bytes), bytes.len(), target, None)
+            .await?;
+    }
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    Ok(())
+}
+
+/// Initializes logging through `LOGGING_DONE`, so the init closure runs at
+/// most once no matter how many test contexts call this.
+/// Panics if logging initialization fails.
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        let format = utils::logging::LogFormat::Test;
+        let error_layer = utils::logging::TracingErrorLayerEnablement::Disabled;
+        utils::logging::init(format, error_layer).expect("logging init failed");
+    });
+}
+
+/// A remote storage client for the S3 tests together with the key prefix the
+/// tests build their object paths from.
+struct EnabledS3 {
+    client: Arc<GenericRemoteStorage>,
+    base_prefix: &'static str,
+}
+
+impl EnabledS3 {
+    /// Creates the S3 client via `create_s3_client`, passing the given limit
+    /// through, and pairs it with `BASE_PREFIX`.
+    ///
+    /// Panics if the client cannot be created.
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let client = create_s3_client(max_keys_in_list_response)
+            .context("S3 client creation")
+            .expect("S3 client creation failed");
+
+        Self {
+            client,
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
+/// Test context carrying only an S3 client (no pre-uploaded blobs), or
+/// `Disabled` when real S3 tests are not enabled.
 enum MaybeEnabledS3 {
+    Enabled(EnabledS3),
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3 {
+    /// Sets up logging, then creates the S3 client only when the
+    /// `ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME` env variable is set.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        match env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME) {
+            Ok(_) => Self::Enabled(EnabledS3::setup(None).await),
+            Err(_) => {
+                info!(
+                    "`{}` env variable is not set, skipping the test",
+                    ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+                );
+                Self::Disabled
+            }
+        }
+    }
+}
+
+/// Test context for tests that need blobs uploaded before they run.
+/// `UploadsFailed` keeps the partially populated state next to the error.
+enum MaybeEnabledS3WithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }

 struct S3WithTestBlobs {
-    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
+    enabled: EnabledS3,
    remote_prefixes: HashSet<RemotePath>,
    remote_blobs: HashSet<RemotePath>,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3 {
+impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    async fn setup() -> Self {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
+        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
@@ -115,23 +267,14 @@ impl AsyncTestContext for MaybeEnabledS3 {
        let max_keys_in_list_response = 10;
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());

-        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
-            .context("S3 client creation")
-            .expect("S3 client creation failed");
+        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

-        let base_prefix_str = "test/";
-        match upload_s3_data(
-            &client_with_excessive_pagination,
-            base_prefix_str,
-            upload_tasks_count,
-        )
-        .await
-        {
+        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");
+
                Self::Enabled(S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                })
@@ -139,8 +282,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                },
@@ -152,13 +294,75 @@ impl AsyncTestContext for MaybeEnabledS3 {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
+                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
            }
        }
    }
 }

-fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+// NOTE: the setups for the list_prefixes test and the list_files test are very similar.
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
+enum MaybeEnabledS3WithSimpleTestBlobs {
+    Enabled(S3WithSimpleTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
+}
+// S3 client plus the set of blob paths that were uploaded during setup;
+// the same set is handed to `cleanup` on teardown.
+struct S3WithSimpleTestBlobs {
+    enabled: EnabledS3,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
+    /// Creates the S3 client with a list page size of 10 and uploads
+    /// `2 * 10 + 1` blobs, so a full listing needs more than one page.
+    /// Returns `Disabled` without touching S3 when the enabling env variable
+    /// is not set.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let page_size: i32 = 10;
+        let blob_count = 1 + (2 * usize::try_from(page_size).unwrap());
+
+        let enabled = EnabledS3::setup(Some(page_size)).await;
+
+        let upload_outcome = upload_simple_s3_data(&enabled.client, blob_count).await;
+        match upload_outcome {
+            ControlFlow::Break(uploads) => Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
+                S3WithSimpleTestBlobs {
+                    enabled,
+                    remote_blobs: uploads,
+                },
+            ),
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+
+                Self::Enabled(S3WithSimpleTestBlobs {
+                    enabled,
+                    remote_blobs: uploads,
+                })
+            }
+        }
+    }
+
+    /// Hands every blob recorded during setup to `cleanup`, including after a
+    /// partially failed upload.
+    async fn teardown(self) {
+        if let Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) = self {
+            cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+        }
+    }
+}
+
+fn create_s3_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
@@ -166,7 +370,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
    let random_prefix_part = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
-        .as_millis();
+        .as_nanos();
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
@@ -176,7 +380,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response: Some(max_keys_per_list_response),
+            max_keys_per_list_response,
        }),
    };
    Ok(Arc::new(
@@ -272,3 +476,52 @@ async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<
        }
    }
 }
+
+// Uploads files `folder{j}/blob_{i}.txt` for `i` in `1..=upload_tasks_count`,
+// where `j = i / 7`. See test description for more details.
+// Returns the set of successfully uploaded paths, wrapped in
+// `ControlFlow::Break` if at least one upload task failed and in
+// `ControlFlow::Continue` otherwise.
+async fn upload_simple_s3_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} S3 files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let blob_path = RemotePath::new(&blob_path)
+                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let payload = format!("remote blob data {i}").into_bytes();
+            let payload_len = payload.len();
+            task_client
+                .upload(std::io::Cursor::new(payload), payload_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut any_upload_failed = false;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(joined) = upload_tasks.join_next().await {
+        // Flatten "task could not be joined" and "upload returned an error"
+        // into a single result.
+        let outcome = joined
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"));
+        match outcome {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                any_upload_failed = true;
+            }
+        }
+    }
+
+    if any_upload_failed {
+        ControlFlow::Break(uploaded_blobs)
+    } else {
+        ControlFlow::Continue(uploaded_blobs)
+    }
+}
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -0,0 +1,33 @@
+use std::sync::Arc;
+
+use tokio::sync::{mpsc, Mutex};
+
+/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
+///
+/// Can be cloned, moved and kept around in futures as "guard objects".
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);
+
+/// Barrier will wait until all clones of [`Completion`] have been dropped.
+#[derive(Clone)]
+pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+
+impl Barrier {
+    pub async fn wait(self) {
+        self.0.lock().await.recv().await;
+    }
+
+    pub async fn maybe_wait(barrier: Option<Barrier>) {
+        if let Some(b) = barrier {
+            b.wait().await
+        }
+    }
+}
+
+/// Creates a connected [`Completion`] / [`Barrier`] pair.
+///
+/// The barrier stops waiting once the returned completion and all of its clones are dropped.
+pub fn channel() -> (Completion, Barrier) {
+    let (tx, rx) = mpsc::channel::<()>(1);
+    (Completion(tx), Barrier(Arc::new(Mutex::new(rx))))
+}
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,6 +1,8 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};

+use anyhow::Context;
+
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
    }
 }

+/// Returns whether the directory at `path` has no entries; fails if it cannot be read.
+pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
+    let entries = tokio::fs::read_dir(&path).await;
+    let mut dir = entries.with_context(|| format!("read_dir({})", path.as_ref().display()))?;
+    Ok(dir.next_entry().await?.is_none())
+}
+
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

+    use crate::fs_ext::is_directory_empty;
+
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
+
+    #[tokio::test]
+    async fn is_empty_dir_async() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        // test positive case
+        assert!(
+            is_directory_empty(dir_path).await.expect("test failure"),
+            "new tempdir should be empty"
+        );
+
+        // invoke on a file to ensure it returns an error
+        let file_path: PathBuf = dir_path.join("testfile");
+        let f = std::fs::File::create(&file_path).unwrap();
+        drop(f);
+        assert!(is_directory_empty(&file_path).await.is_err());
+
+        // do it again on a path we know to be nonexistent
+        std::fs::remove_file(&file_path).unwrap();
+        assert!(is_directory_empty(file_path).await.is_err());
+    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,23 +1,20 @@
 use crate::auth::{Claims, JwtAuth};
-use crate::http::error;
-use anyhow::{anyhow, Context};
+use crate::http::error::{api_error_handler, route_error_handler, ApiError};
+use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
 use hyper::Method;
-use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
+use hyper::{header::CONTENT_TYPE, Body, Request, Response};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
-use std::net::TcpListener;
 use std::str::FromStr;

-use super::error::ApiError;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -35,8 +32,18 @@ struct RequestId(String);
 /// Adds a tracing info_span! instrumentation around the handler events,
 /// logs the request start and end events for non-GET requests and non-200 responses.
 ///
+/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
+///
 /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
-/// in this type will get request info logged in the wrapping span, including the unique request ID.
+/// with this will get request info logged in the wrapping span, including the unique request ID.
+///
+/// This also handles errors, logging them and converting them to an HTTP error response.
+///
+/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
+/// completion. In other words, the handler must be async cancellation safe! request_span
+/// prints a warning to the log when that happens, so that you have some trace of it in
+/// the log.
+///
 ///
 /// There could be other ways to implement similar functionality:
 ///
@@ -54,60 +61,56 @@ struct RequestId(String);
 /// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
-pub struct RequestSpan<E, R, H>(pub H)
+pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
-
-impl<E, R, H> RequestSpan<E, R, H>
-where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
-    /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
-    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
-    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
-        let request_id = request.context::<RequestId>().unwrap_or_default().0;
-        let method = request.method();
-        let path = request.uri().path();
-        let request_span = info_span!("request", %method, %path, %request_id);
+    let request_id = request.context::<RequestId>().unwrap_or_default().0;
+    let method = request.method();
+    let path = request.uri().path();
+    let request_span = info_span!("request", %method, %path, %request_id);

-        let log_quietly = method == Method::GET;
-        async move {
-            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
-            if log_quietly {
-                debug!("Handling request");
-            } else {
-                info!("Handling request");
-            }
-
-            // Note that we reuse `error::handler` here and not returning and error at all,
-            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
-            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
-            //
-            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            let res = (self.0)(request).await;
-
-            cancellation_guard.disarm();
-
-            match res {
-                Ok(response) => {
-                    let response_status = response.status();
-                    if log_quietly && response_status.is_success() {
-                        debug!("Request handled, status: {response_status}");
-                    } else {
-                        info!("Request handled, status: {response_status}");
-                    }
-                    Ok(response)
-                }
-                Err(e) => Ok(error::handler(e.into()).await),
-            }
+    let log_quietly = method == Method::GET;
+    async move {
+        let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
+        if log_quietly {
+            debug!("Handling request");
+        } else {
+            info!("Handling request");
+        }
+
+        // No special handling for panics here. There's a `tracing_panic_hook` from another
+        // module to do that globally.
+        let res = handler(request).await;
+
+        cancellation_guard.disarm();
+
+        // Log the result if needed.
+        //
+        // We also convert any errors into an Ok response with HTTP error code here.
+        // `make_router` sets a last-resort error handler that would do the same, but
+        // we prefer to do it here, before we exit the request span, so that the error
+        // is still logged with the span.
+        //
+        // (Because we convert errors to Ok response, we never actually return an error,
+        // and we could declare the function to return the never type (`!`). However,
+        // using `routerify::RouterBuilder` requires a proper error type.)
+        match res {
+            Ok(response) => {
+                let response_status = response.status();
+                if log_quietly && response_status.is_success() {
+                    debug!("Request handled, status: {response_status}");
+                } else {
+                    info!("Request handled, status: {response_status}");
+                }
+                Ok(response)
+            }
+            Err(err) => Ok(api_error_handler(err)),
        }
-        .instrument(request_span)
-        .await
    }
+    .instrument(request_span)
+    .await
 }

 /// Drop guard to WARN in case the request was dropped before completion.
@@ -207,10 +210,8 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
-        .get("/metrics", |r| {
-            RequestSpan(prometheus_metrics_handler).handle(r)
-        })
-        .err_handler(error::handler)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .err_handler(route_error_handler)
 }

 pub fn attach_openapi_ui(
@@ -220,12 +221,14 @@ pub fn attach_openapi_ui(
    ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    router_builder
-        .get(spec_mount_path, move |r| {
-            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
-                .handle(r)
-        })
-        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
-            Ok(Response::builder().body(Body::from(format!(r#"
+        .get(spec_mount_path,
+            move |r| request_span(r, move |_| async move {
+                Ok(Response::builder().body(Body::from(spec)).unwrap())
+            })
+        )
+        .get(ui_mount_path,
+             move |r| request_span(r, move |_| async move {
+                 Ok(Response::builder().body(Body::from(format!(r#"
                <!DOCTYPE html>
                <html lang="en">
                <head>
@@ -255,7 +258,8 @@ pub fn attach_openapi_ui(
                </body>
                </html>
            "#, spec_mount_path))).unwrap())
-        }).handle(r))
+             })
+        )
 }

 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
@@ -343,40 +347,6 @@ pub fn check_permission_with(
    }
 }

-///
-/// Start listening for HTTP requests on given socket.
-///
-/// 'shutdown_future' can be used to stop. If the Future becomes
-/// ready, we stop listening for new requests, and the function returns.
-///
-pub fn serve_thread_main<S>(
-    router_builder: RouterBuilder<hyper::Body, ApiError>,
-    listener: TcpListener,
-    shutdown_future: S,
-) -> anyhow::Result<()>
-where
-    S: Future<Output = ()> + Send + Sync,
-{
-    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
-
-    // Create a Service from the router above to handle incoming requests.
-    let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
-
-    // Enter a single-threaded tokio runtime bound to the current thread
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()?;
-
-    let _guard = runtime.enter();
-
-    let server = Server::from_tcp(listener)?
-        .serve(service)
-        .with_graceful_shutdown(shutdown_future);
-
-    runtime.block_on(server)?;
-
-    Ok(())
-}
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,5 +1,6 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
+use std::error::Error as StdError;
 use thiserror::Error;
 use tracing::error;

@@ -15,13 +16,13 @@ pub enum ApiError {
    Unauthorized(String),

    #[error("NotFound: {0}")]
-    NotFound(anyhow::Error),
+    NotFound(Box<dyn StdError + Send + Sync + 'static>),

    #[error("Conflict: {0}")]
    Conflict(String),

    #[error("Precondition failed: {0}")]
-    PreconditionFailed(&'static str),
+    PreconditionFailed(Box<str>),

    #[error(transparent)]
    InternalServerError(anyhow::Error),
@@ -83,13 +84,24 @@ impl HttpErrorBody {
    }
 }

-pub async fn handler(err: routerify::RouteError) -> Response<Body> {
-    let api_error = err
-        .downcast::<ApiError>()
-        .expect("handler should always return api error");
+pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
+    match err.downcast::<ApiError>() {
+        Ok(api_error) => api_error_handler(*api_error),
+        Err(other_error) => {
+            // We expect all the request handlers to return an ApiError, so this should
+            // not be reached. But just in case.
+            error!("Error processing HTTP request: {other_error:?}");
+            HttpErrorBody::response_from_msg_and_status(
+                other_error.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            )
+        }
+    }
+}

+pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error.as_ref() {
+    if let ApiError::InternalServerError(_) = api_error {
        error!("Error processing HTTP request: {api_error:?}");
    } else {
        error!("Error processing HTTP request: {api_error:#}");
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -8,12 +8,26 @@ use super::error::ApiError;
 pub async fn json_request<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
 ) -> Result<T, ApiError> {
-    let whole_body = hyper::body::aggregate(request.body_mut())
+    json_request_or_empty_body(request)
+        .await?
+        .context("missing request body")
+        .map_err(ApiError::BadRequest)
+}
+
+/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
+pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
+    request: &mut Request<Body>,
+) -> Result<Option<T>, ApiError> {
+    let body = hyper::body::aggregate(request.body_mut())
        .await
        .context("Failed to read request body")
        .map_err(ApiError::BadRequest)?;
-    serde_json::from_reader(whole_body.reader())
+    if body.remaining() == 0 {
+        return Ok(None);
+    }
+    serde_json::from_reader(body.reader())
        .context("Failed to parse json request")
+        .map(Some)
        .map_err(ApiError::BadRequest)
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,6 +60,9 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

+/// A simple one-shot barrier and a guard that keeps the barrier waiting while it is alive.
+pub mod completion;
+
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
            is_incremental: false,
            short_id: format!("Layer {}", i),
        };
-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "pagectl"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+clap = { workspace = true, features = ["string"] }
+git-version.workspace = true
+pageserver = { path = ".." }
+postgres_ffi.workspace = true
+utils.workspace = true
+svg_fmt.workspace = true
+workspace_hack.workspace = true
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -12,7 +12,7 @@
 //! Example use:
 //! ```
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
+//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
 //! ```
 //!
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    (keys, lsns)
 }

-fn main() -> Result<()> {
+pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
    let stdin = io::stdin();
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -6,7 +6,7 @@ use anyhow::Result;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
-use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
+use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
@@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile;

 use utils::{bin_ser::BeSer, lsn::Lsn};

+use crate::AnalyzeLayerMapCmd;
+
 const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
 const DEFAULT_MAX_HOLES: usize = 10;

 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(PartialEq, Eq)]
-struct Hole(Range<Key>);
+pub struct Hole(Range<Key>);

 impl Ord for Hole {
    fn cmp(&self, other: &Self) -> Ordering {
@@ -39,11 +41,11 @@ impl PartialOrd for Hole {
    }
 }

-struct LayerFile {
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-    is_delta: bool,
-    holes: Vec<Hole>,
+pub(crate) struct LayerFile {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+    pub is_delta: bool,
+    pub holes: Vec<Hole>,
 }

 impl LayerFile {
@@ -67,7 +69,7 @@ impl LayerFile {
    }
 }

-fn parse_filename(name: &str) -> Option<LayerFile> {
+pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
    let split: Vec<&str> = name.split("__").collect();
    if split.len() != 2 {
        return None;
@@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
 }

-fn main() -> Result<()> {
-    let args: Vec<String> = env::args().collect();
-    if args.len() < 2 {
-        println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
-        return Ok(());
-    }
-    let storage_path = PathBuf::from_str(&args[1])?;
-    let max_holes = if args.len() > 2 {
-        args[2].parse::<usize>().unwrap()
-    } else {
-        DEFAULT_MAX_HOLES
-    };
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+    let storage_path = &cmd.path;
+    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -0,0 +1,169 @@
+use std::path::{Path, PathBuf};
+
+use anyhow::Result;
+use clap::Subcommand;
+use pageserver::tenant::block_io::BlockCursor;
+use pageserver::tenant::disk_btree::DiskBtreeReader;
+use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::{page_cache, virtual_file};
+use pageserver::{
+    repository::{Key, KEY_SIZE},
+    tenant::{
+        block_io::FileBlockReader, disk_btree::VisitDirection,
+        storage_layer::delta_layer::DELTA_KEY_SIZE,
+    },
+    virtual_file::VirtualFile,
+};
+use std::fs;
+use utils::bin_ser::BeSer;
+
+use crate::layer_map_analyzer::parse_filename;
+
+#[derive(Subcommand)]
+pub(crate) enum LayerCmd {
+    /// List all tenants and timelines under the pageserver path
+    ///
+    /// Example: `cargo run --bin pagectl layer list .neon/`
+    List { path: PathBuf },
+    /// List all layers of a given tenant and timeline
+    ///
+    /// Example: `cargo run --bin pagectl layer list-layer .neon/ $TENANT $TIMELINE`
+    ListLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+    },
+    /// Dump all information of a layer file
+    DumpLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+        /// The id from list-layer command
+        id: usize,
+    },
+}
+
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
+    use pageserver::tenant::block_io::BlockReader;
+
+    let path = path.as_ref();
+    virtual_file::init(10);
+    page_cache::init(100);
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
+    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+        actual_summary.index_start_blk,
+        actual_summary.index_root_blk,
+        &file,
+    );
+    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
+    let mut all = vec![];
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, value_offset| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            all.push((curr, BlobRef(value_offset)));
+            true
+        },
+    )?;
+    let mut cursor = BlockCursor::new(&file);
+    for (k, v) in all {
+        let value = cursor.read_blob(v.pos())?;
+        println!("key:{} value_len:{}", k, value.len());
+    }
+    // TODO(chi): special handling for last key?
+    Ok(())
+}
+
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
+    match cmd {
+        LayerCmd::List { path } => {
+            for tenant in fs::read_dir(path.join("tenants"))? {
+                let tenant = tenant?;
+                if !tenant.file_type()?.is_dir() {
+                    continue;
+                }
+                println!("tenant {}", tenant.file_name().to_string_lossy());
+                for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+                    let timeline = timeline?;
+                    if !timeline.file_type()?.is_dir() {
+                        continue;
+                    }
+                    println!("- timeline {}", timeline.file_name().to_string_lossy());
+                }
+            }
+        }
+        LayerCmd::ListLayer {
+            path,
+            tenant,
+            timeline,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    println!(
+                        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                        idx,
+                        layer_file.key_range.start,
+                        layer_file.key_range.end,
+                        layer_file.lsn_range.start,
+                        layer_file.lsn_range.end,
+                        layer_file.is_delta,
+                    );
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::DumpLayer {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        // TODO(chi): dedup code
+                        println!(
+                            "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                            idx,
+                            layer_file.key_range.start,
+                            layer_file.key_range.end,
+                            layer_file.lsn_range.start,
+                            layer_file.lsn_range.end,
+                            layer_file.is_delta,
+                        );
+
+                        if layer_file.is_delta {
+                            read_delta_file(layer.path())?;
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+    }
+    Ok(())
+}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -0,0 +1,179 @@
+//! A helper tool to manage pageserver binary files.
+//! Accepts a file as an argument, attempts to parse it with all ways possible
+//! and prints its interpreted context.
+//!
+//! A separate `metadata` subcommand allows printing and updating the pageserver's metadata file.
+
+mod draw_timeline_dir;
+mod layer_map_analyzer;
+mod layers;
+
+use clap::{Parser, Subcommand};
+use layers::LayerCmd;
+use pageserver::{
+    context::{DownloadBehavior, RequestContext},
+    page_cache,
+    task_mgr::TaskKind,
+    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
+    virtual_file,
+};
+use postgres_ffi::ControlFileData;
+use std::path::{Path, PathBuf};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver binutils",
+    long_about = "Reads pageserver (and related) binary files management utility"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    Metadata(MetadataCmd),
+    PrintLayerFile(PrintLayerFileCmd),
+    DrawTimeline {},
+    AnalyzeLayerMap(AnalyzeLayerMapCmd),
+    #[command(subcommand)]
+    Layer(LayerCmd),
+}
+
+/// Read and update pageserver metadata file
+#[derive(Parser)]
+struct MetadataCmd {
+    /// Input metadata file path
+    metadata_path: PathBuf,
+    /// Replace disk consistent Lsn
+    disk_consistent_lsn: Option<Lsn>,
+    /// Replace previous record Lsn
+    prev_record_lsn: Option<Lsn>,
+    /// Replace latest gc cutoff
+    latest_gc_cuttoff: Option<Lsn>,
+}
+
+#[derive(Parser)]
+struct PrintLayerFileCmd {
+    /// Pageserver data path
+    path: PathBuf,
+}
+
+#[derive(Parser)]
+struct AnalyzeLayerMapCmd {
+    /// Pageserver data path
+    path: PathBuf,
+    /// Max holes
+    max_holes: Option<usize>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+
+    match cli.command {
+        Commands::Layer(cmd) => {
+            layers::main(&cmd)?;
+        }
+        Commands::Metadata(cmd) => {
+            handle_metadata(&cmd)?;
+        }
+        Commands::DrawTimeline {} => {
+            draw_timeline_dir::main()?;
+        }
+        Commands::AnalyzeLayerMap(cmd) => {
+            layer_map_analyzer::main(&cmd)?;
+        }
+        Commands::PrintLayerFile(cmd) => {
+            if let Err(e) = read_pg_control_file(&cmd.path) {
+                println!(
+                    "Failed to read input file as a pg control one: {e:#}\n\
+                    Attempting to read it as layer file"
+                );
+                print_layerfile(&cmd.path)?;
+            }
+        }
+    };
+    Ok(())
+}
+
+fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
+    println!("{control_file:?}");
+    let control_file_initdb = Lsn(control_file.checkPoint);
+    println!(
+        "pg_initdb_lsn: {}, aligned: {}",
+        control_file_initdb,
+        control_file_initdb.align()
+    );
+    Ok(())
+}
+
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+    // Basic initialization of things that don't change after startup
+    virtual_file::init(10);
+    page_cache::init(100);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    dump_layerfile_from_path(path, true, &ctx)
+}
+
+fn handle_metadata(
+    MetadataCmd {
+        metadata_path: path,
+        disk_consistent_lsn,
+        prev_record_lsn,
+        latest_gc_cuttoff,
+    }: &MetadataCmd,
+) -> Result<(), anyhow::Error> {
+    let metadata_bytes = std::fs::read(path)?;
+    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
+    println!("Current metadata:\n{meta:?}");
+    let mut update_meta = false;
+    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
+        meta = TimelineMetadata::new(
+            *disk_consistent_lsn,
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(prev_record_lsn) = prev_record_lsn {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            Some(*prev_record_lsn),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            *latest_gc_cuttoff,
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+
+    if update_meta {
+        let metadata_bytes = meta.to_bytes()?;
+        std::fs::write(path, metadata_bytes)?;
+    }
+
+    Ok(())
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
 use tracing::*;

@@ -18,9 +19,7 @@ use pageserver::{
    context::{DownloadBehavior, RequestContext},
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
-    task_mgr::{
-        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
-    },
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -276,7 +275,18 @@ fn start_pageserver(
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

    // Launch broker client
-    WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
+    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
+    let broker_client = WALRECEIVER_RUNTIME
+        .block_on(async {
+            // Note: we do not attempt connecting here (but validate endpoints sanity).
+            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+        })
+        .with_context(|| {
+            format!(
+                "create broker client for uri={:?} keepalive_interval={:?}",
+                &conf.broker_endpoint, conf.broker_keepalive_interval,
+            )
+        })?;

    // Initialize authentication for incoming connections
    let http_auth;
@@ -325,8 +335,118 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Startup staging or optimizing:
+    //
+    // We want to minimize downtime for `page_service` connections, and try not to overload
+    // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
+    //
+    // init_done_rx will notify when all initial load operations have completed.
+    //
+    // background_jobs_can_start (same name used to hold off background jobs from starting at
+    // consumer side) will be dropped once we can start the background jobs. Currently that is
+    // when all initial logical size calculations have completed (init_logical_size_done_rx) or
+    // the timeout (background_task_maximum_delay) has elapsed, whichever comes first.
+    let (init_done_tx, init_done_rx) = utils::completion::channel();
+
+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
+    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
+
+    let order = pageserver::InitializationOrder {
+        initial_tenant_load: Some(init_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: init_logical_size_done_tx,
+        background_jobs_can_start: background_jobs_barrier.clone(),
+    };
+
    // Scan the local 'tenants/' directory and start loading the tenants
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
+    let init_started_at = std::time::Instant::now();
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+        conf,
+        broker_client.clone(),
+        remote_storage.clone(),
+        order,
+    ))?;
+
+    BACKGROUND_RUNTIME.spawn({
+        let init_done_rx = init_done_rx;
+        let shutdown_pageserver = shutdown_pageserver.clone();
+        let drive_init = async move {
+            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+
+            init_done_rx.wait().await;
+            // initial logical sizes can now start, as they were waiting on init_done_rx.
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            let init_done = std::time::Instant::now();
+            let elapsed = init_done - init_started_at;
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed"
+            );
+
+            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+
+            let timeout = conf.background_task_maximum_delay;
+
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
+                    let now = std::time::Instant::now();
+                    tracing::info!(
+                        from_init_done_millis = (now - init_done).as_millis(),
+                        from_init_millis = (now - init_started_at).as_millis(),
+                        "Initial logical sizes completed"
+                    );
+                    None
+                }
+                _ = tokio::time::sleep(timeout) => {
+                    tracing::info!(
+                        timeout_millis = timeout.as_millis(),
+                        "Initial logical size timeout elapsed; starting background jobs"
+                    );
+                    Some(init_sizes_done)
+                }
+            };
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            // allow background jobs to start
+            drop(background_jobs_can_start);
+
+            if let Some(init_sizes_done) = init_sizes_done {
+                // ending up here is not a bug; at the latest logical sizes will be queried by
+                // consumption metrics.
+                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+                init_sizes_done.await;
+
+                scopeguard::ScopeGuard::into_inner(guard);
+
+                let now = std::time::Instant::now();
+                tracing::info!(
+                    from_init_done_millis = (now - init_done).as_millis(),
+                    from_init_millis = (now - init_started_at).as_millis(),
+                    "Initial logical sizes completed after timeout (background jobs already started)"
+                );
+
+            }
+        };
+
+        async move {
+            let mut drive_init = std::pin::pin!(drive_init);
+            // just race these tasks
+            tokio::select! {
+                _ = shutdown_pageserver.cancelled() => {},
+                _ = &mut drive_init => {},
+            }
+        }
+    });

    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -339,6 +459,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            background_jobs_barrier.clone(),
        )?;
    }

@@ -351,6 +472,7 @@ fn start_pageserver(
            conf,
            launch_ts,
            http_auth,
+            broker_client.clone(),
            remote_storage,
            disk_usage_eviction_state,
        )?
@@ -373,37 +495,50 @@ fn start_pageserver(
                Ok(())
            },
        );
+    }

-        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-            let metrics_ctx = RequestContext::todo_child(
-                TaskKind::MetricsCollection,
-                // This task itself shouldn't download anything.
-                // The actual size calculation does need downloads, and
-                // creates a child context with the right DownloadBehavior.
-                DownloadBehavior::Error,
-            );
-            task_mgr::spawn(
-                MGMT_REQUEST_RUNTIME.handle(),
-                TaskKind::MetricsCollection,
-                None,
-                None,
-                "consumption metrics collection",
-                true,
-                async move {
-                    pageserver::consumption_metrics::collect_metrics(
-                        metric_collection_endpoint,
-                        conf.metric_collection_interval,
-                        conf.cached_metric_collection_interval,
-                        conf.synthetic_size_calculation_interval,
-                        conf.id,
-                        metrics_ctx,
-                    )
-                    .instrument(info_span!("metrics_collection"))
-                    .await?;
-                    Ok(())
-                },
-            );
-        }
+    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+        let background_jobs_barrier = background_jobs_barrier;
+        let metrics_ctx = RequestContext::todo_child(
+            TaskKind::MetricsCollection,
+            // This task itself shouldn't download anything.
+            // The actual size calculation does need downloads, and
+            // creates a child context with the right DownloadBehavior.
+            DownloadBehavior::Error,
+        );
+        task_mgr::spawn(
+            crate::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MetricsCollection,
+            None,
+            None,
+            "consumption metrics collection",
+            true,
+            async move {
+                // first wait until background jobs are cleared to launch.
+                //
+                // this is because we only process active tenants and timelines, and the
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // which will not be rate-limited.
+                let cancel = task_mgr::shutdown_token();
+
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+
+                pageserver::consumption_metrics::collect_metrics(
+                    metric_collection_endpoint,
+                    conf.metric_collection_interval,
+                    conf.cached_metric_collection_interval,
+                    conf.synthetic_size_calculation_interval,
+                    conf.id,
+                    metrics_ctx,
+                )
+                .instrument(info_span!("metrics_collection"))
+                .await?;
+                Ok(())
+            },
+        );
    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
@@ -427,6 +562,7 @@ fn start_pageserver(
            async move {
                page_service::libpq_listener_main(
                    conf,
+                    broker_client,
                    pg_auth,
                    pageserver_listener,
                    conf.pg_auth_type,
@@ -437,6 +573,8 @@ fn start_pageserver(
        );
    }

+    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
+
    // All started up! Now just sit and wait for shutdown signal.
    ShutdownSignals::handle(|signal| match signal {
        Signal::Quit => {
@@ -452,6 +590,11 @@ fn start_pageserver(
                "Got {}. Terminating gracefully in fast shutdown mode",
                signal.name()
            );
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -1,157 +0,0 @@
-//! A helper tool to manage pageserver binary files.
-//! Accepts a file as an argument, attempts to parse it with all ways possible
-//! and prints its interpreted context.
-//!
-//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
-use std::{
-    path::{Path, PathBuf},
-    str::FromStr,
-};
-
-use anyhow::Context;
-use clap::{value_parser, Arg, Command};
-
-use pageserver::{
-    context::{DownloadBehavior, RequestContext},
-    page_cache,
-    task_mgr::TaskKind,
-    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file,
-};
-use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
-
-project_git_version!(GIT_VERSION);
-
-const METADATA_SUBCOMMAND: &str = "metadata";
-
-fn main() -> anyhow::Result<()> {
-    let arg_matches = cli().get_matches();
-
-    match arg_matches.subcommand() {
-        Some((subcommand_name, subcommand_matches)) => {
-            let path = subcommand_matches
-                .get_one::<PathBuf>("metadata_path")
-                .context("'metadata_path' argument is missing")?
-                .to_path_buf();
-            anyhow::ensure!(
-                subcommand_name == METADATA_SUBCOMMAND,
-                "Unknown subcommand {subcommand_name}"
-            );
-            handle_metadata(&path, subcommand_matches)?;
-        }
-        None => {
-            let path = arg_matches
-                .get_one::<PathBuf>("path")
-                .context("'path' argument is missing")?
-                .to_path_buf();
-            println!(
-                "No subcommand specified, attempting to guess the format for file {}",
-                path.display()
-            );
-            if let Err(e) = read_pg_control_file(&path) {
-                println!(
-                    "Failed to read input file as a pg control one: {e:#}\n\
-                    Attempting to read it as layer file"
-                );
-                print_layerfile(&path)?;
-            }
-        }
-    };
-    Ok(())
-}
-
-fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
-    println!("{control_file:?}");
-    let control_file_initdb = Lsn(control_file.checkPoint);
-    println!(
-        "pg_initdb_lsn: {}, aligned: {}",
-        control_file_initdb,
-        control_file_initdb.align()
-    );
-    Ok(())
-}
-
-fn print_layerfile(path: &Path) -> anyhow::Result<()> {
-    // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
-    page_cache::init(100);
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx)
-}
-
-fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(path)?;
-    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
-    println!("Current metadata:\n{meta:?}");
-    let mut update_meta = false;
-    if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
-        meta = TimelineMetadata::new(
-            Lsn::from_str(disk_consistent_lsn)?,
-            meta.prev_record_lsn(),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-    if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
-        meta = TimelineMetadata::new(
-            meta.disk_consistent_lsn(),
-            Some(Lsn::from_str(prev_record_lsn)?),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-
-    if update_meta {
-        let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(path, metadata_bytes)?;
-    }
-
-    Ok(())
-}
-
-fn cli() -> Command {
-    Command::new("Neon Pageserver binutils")
-        .about("Reads pageserver (and related) binary files management utility")
-        .version(GIT_VERSION)
-        .arg(
-            Arg::new("path")
-                .help("Input file path")
-                .value_parser(value_parser!(PathBuf))
-                .required(false),
-        )
-        .subcommand(
-            Command::new(METADATA_SUBCOMMAND)
-                .about("Read and update pageserver metadata file")
-                .arg(
-                    Arg::new("metadata_path")
-                        .help("Input metadata file path")
-                        .value_parser(value_parser!(PathBuf))
-                        .required(false),
-                )
-                .arg(
-                    Arg::new("disk_consistent_lsn")
-                        .long("disk_consistent_lsn")
-                        .help("Replace disk consistent Lsn"),
-                )
-                .arg(
-                    Arg::new("prev_record_lsn")
-                        .long("prev_record_lsn")
-                        .help("Replace previous record Lsn"),
-                ),
-        )
-}
-
-#[test]
-fn verify_cli() {
-    cli().debug_assert();
-}
--- a/pageserver/src/broker_client.rs
+++ b/pageserver/src/broker_client.rs
@@ -1,48 +0,0 @@
-//! The broker client instance of the pageserver, created during pageserver startup.
-//! Used by each timelines' [`walreceiver`].
-
-use crate::config::PageServerConf;
-
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-use storage_broker::BrokerClientChannel;
-use tracing::*;
-
-static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
-
-///
-/// Initialize the broker client. This must be called once at page server startup.
-///
-pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
-    let broker_endpoint = conf.broker_endpoint.clone();
-
-    // Note: we do not attempt connecting here (but validate endpoints sanity).
-    let broker_client =
-        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
-            format!(
-                "Failed to create broker client to {}",
-                &conf.broker_endpoint
-            ),
-        )?;
-
-    if BROKER_CLIENT.set(broker_client).is_err() {
-        panic!("broker already initialized");
-    }
-
-    info!(
-        "Initialized broker client with endpoints: {}",
-        broker_endpoint
-    );
-    Ok(())
-}
-
-///
-/// Get a handle to the broker client
-///
-pub fn get_broker_client() -> &'static BrokerClientChannel {
-    BROKER_CLIENT.get().expect("broker client not initialized")
-}
-
-pub fn is_broker_client_initialized() -> bool {
-    BROKER_CLIENT.get().is_some()
-}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -63,6 +63,7 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    ///
    /// Default built-in configuration file.
@@ -91,15 +92,16 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

-
 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}

-# [tenant_config]
+#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
+
+[tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
 #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
 #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
+#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}

 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}
@@ -108,8 +110,9 @@ pub mod defaults {

 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
+#gc_feedback = false

-# [remote_storage]
+[remote_storage]

 "###
    );
@@ -187,6 +190,15 @@ pub struct PageServerConf {
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
+
+    /// How long will background tasks be delayed at most after initial load of tenants.
+    ///
+    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
+    /// as we now isolate initial loading, initial logical size calculation and background tasks.
+    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
+    /// has its initial logical size calculated. Not running background tasks for some seconds is
+    /// not terrible.
+    pub background_task_maximum_delay: Duration,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -259,6 +271,8 @@ struct PageServerConfigBuilder {
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
+
+    background_task_maximum_delay: BuilderValue<Duration>,
 }

 impl Default for PageServerConfigBuilder {
@@ -316,6 +330,11 @@ impl Default for PageServerConfigBuilder {
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
+
+            background_task_maximum_delay: Set(humantime::parse_duration(
+                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
+            )
+            .unwrap()),
        }
    }
 }
@@ -440,6 +459,10 @@ impl PageServerConfigBuilder {
            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
    }

+    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
+        self.background_task_maximum_delay = BuilderValue::Set(delay);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -522,6 +545,9 @@ impl PageServerConfigBuilder {
                .ok_or(anyhow!(
                    "missing ondemand_download_behavior_treat_error_as_warn"
                ))?,
+            background_task_maximum_delay: self
+                .background_task_maximum_delay
+                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
        })
    }
 }
@@ -710,6 +736,7 @@ impl PageServerConf {
                    )
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
+                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -797,7 +824,8 @@ impl PageServerConf {
            )?);
        }
        if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-            t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
+            t_conf.max_lsn_wal_lag =
+                Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
        }
        if let Some(trace_read_requests) = item.get("trace_read_requests") {
            t_conf.trace_read_requests =
@@ -827,6 +855,14 @@ impl PageServerConf {
            )?);
        }

+        if let Some(gc_feedback) = item.get("gc_feedback") {
+            t_conf.gc_feedback = Some(
+                gc_feedback
+                    .as_bool()
+                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
+            );
+        }
+
        Ok(t_conf)
    }

@@ -868,6 +904,7 @@ impl PageServerConf {
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
+            background_task_maximum_delay: Duration::ZERO,
        }
    }
 }
@@ -1027,6 +1064,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

 log_format = 'json'
+background_task_maximum_delay = '334 s'

 "#;

@@ -1085,6 +1123,9 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: humantime::parse_duration(
+                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
+                )?,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1139,6 +1180,7 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: Duration::from_secs(334),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -5,7 +5,7 @@
 //!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
+use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
 use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -113,7 +113,7 @@ pub async fn collect_metrics_iteration(
    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
-    _ctx: &RequestContext,
+    ctx: &RequestContext,
    send_cached: bool,
 ) {
    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
@@ -164,15 +164,30 @@ pub async fn collect_metrics_iteration(
                    timeline_written_size,
                ));

-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
-                        tenant_id,
-                        timeline_id: Some(timeline.timeline_id),
-                        metric: TIMELINE_LOGICAL_SIZE,
-                    },
-                    timeline.get_current_logical_size(),
-                ));
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
+                    // Only send timeline logical size when it is fully calculated.
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
+                    }
+                    Ok((_, _)) => {}
+                    Err(err) => {
+                        error!(
+                            "failed to get current logical size for timeline {}: {err:?}",
+                            timeline.timeline_id
+                        );
+                        continue;
+                    }
+                };
            }
+
            let timeline_resident_size = timeline.get_resident_physical_size();
            tenant_resident_size += timeline_resident_size;
        }
@@ -321,6 +336,7 @@ pub async fn calculate_synthetic_size_worker(
                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
                    {
                        if let Err(e) = tenant.calculate_synthetic_size(
+                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
                            ctx).await {
                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
                        }
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,6 +88,7 @@
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
+#[derive(Clone, Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
@@ -95,7 +96,7 @@ pub struct RequestContext {

 /// Desired behavior if the operation requires an on-demand download
 /// to proceed.
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DownloadBehavior {
    /// Download the layer file. It can take a while.
    Download,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;

 use crate::{
@@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -98,14 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
        "disk usage based eviction",
        false,
        async move {
-            disk_usage_eviction_task(
-                &state,
-                task_config,
-                storage,
-                &conf.tenants_path(),
-                task_mgr::shutdown_token(),
-            )
-            .await;
+            let cancel = task_mgr::shutdown_token();
+
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            tokio::select! {
+                _ = cancel.cancelled() => { return Ok(()); },
+                _ = background_jobs_barrier.wait() => { }
+            };
+
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+                .await;
            info!("disk usage based eviction task finishing");
            Ok(())
        },
@@ -512,7 +516,7 @@ async fn collect_eviction_candidates(
            if !tl.is_active() {
                continue;
            }
-            let info = tl.get_local_layers_for_disk_usage_eviction();
+            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -215,7 +215,7 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "412":
-          description: Tenant is missing
+          description: Tenant is missing, or timeline has children
          content:
            application/json:
              schema:
@@ -363,11 +363,30 @@ paths:
        * MUST NOT ASSUME that the request has been lost, based on the observation
          that a subsequent tenant status request returns 404. The request may
          still be in flight. It must be retried.
+
+        The client SHOULD supply a `TenantConfig` for the tenant in the request body.
+        Settings specified in the config override the pageserver's defaults.
+        It is guaranteed that the config settings are applied before the pageserver
+        starts operating on the tenant. E.g., if the config specifies a specific
+        PITR interval for a tenant, then that setting will be in effect before the
+        pageserver starts the garbage collection loop. This enables a client to
+        guarantee a specific PITR setting across detach/attach cycles.
+        The pageserver will reject the request if it cannot parse the config, or
+        if there are any unknown fields in it.
+
+        If the client does not supply a config, the pageserver will use its defaults.
+        This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantAttachRequest"
      responses:
        "202":
          description: Tenant attaching scheduled
        "400":
-          description: Error when no tenant id found in path parameters
+          description: Bad Request
          content:
            application/json:
              schema:
@@ -660,6 +679,8 @@ paths:
          application/json:
            schema:
              type: object
+              required:
+                - new_timeline_id
              properties:
                new_timeline_id:
                  type: string
@@ -908,20 +929,45 @@ components:
              writing to the tenant's S3 state, so, DO NOT ATTACH the
              tenant to any other pageserver, or we risk split-brain.
            - `attached` means that the attach operation has completed,
-              maybe successfully, maybe not. Perform a health check at
-              the Postgres level to determine healthiness of the tenant.
+              successfully.
+            - `failed` means that attach has failed. For the reason, check the corresponding `reason` field.
+              `failed` is a terminal state; retrying the attach call won't resolve the issue.
+              For example, this can be caused by S3 being unreachable. A retry may be implemented
+              with a call to detach, though it would be better not to automate it and to inspect the failed state
+              manually before proceeding with a retry.

            See the tenant `/attach` endpoint for more information.
-          type: string
-          enum: [ "maybe", "attached" ]
+          type: object
+          required:
+            - slug
+            - data
+          properties:
+            slug:
+              type: string
+              enum: [ "maybe", "attached", "failed" ]
+            data:
+              type: object
+              properties:
+                reason:
+                  type: string
+
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
        - type: object
+          required:
+            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
              format: hex
+    TenantAttachRequest:
+      type: object
+      required:
+        - config
+      properties:
+        config:
+          $ref: '#/components/schemas/TenantConfig'
    TenantConfigRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,3 +1,6 @@
+//!
+//! Management HTTP API
+//!
 use std::collections::HashMap;
 use std::sync::Arc;

@@ -5,12 +8,14 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
+use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::http::endpoint::RequestSpan;
+use utils::http::endpoint::request_span;
+use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use super::models::{
@@ -23,10 +28,12 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
+use crate::tenant::mgr::{
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
    auth::JwtAuth,
@@ -42,7 +49,6 @@ use utils::{
 };

 // Imports only used for testing APIs
-#[cfg(feature = "testing")]
 use super::models::ConfigureFailpointsRequest;

 struct State {
@@ -50,6 +56,7 @@ struct State {
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
+    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 }

@@ -58,6 +65,7 @@ impl State {
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
+        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
@@ -69,6 +77,7 @@ impl State {
            auth,
            allowlist_routes,
            remote_storage,
+            broker_client,
            disk_usage_eviction_state,
        })
    }
@@ -133,20 +142,51 @@ impl From<TenantMapInsertError> for ApiError {
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
 }

+impl From<GetTenantError> for ApiError {
+    fn from(tse: GetTenantError) -> ApiError {
+        match tse {
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            e @ GetTenantError::NotActive(_) => {
+                // Why is this not `ApiError::NotFound`?
+                // Because we must be careful to never return 404 for a tenant if it does
+                // in fact exist locally. If we did, the caller could draw the conclusion
+                // that it can attach the tenant to another PS and we'd be in split-brain.
+                //
+                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
+impl From<SetNewTenantConfigError> for ApiError {
+    fn from(e: SetNewTenantConfigError) -> ApiError {
+        match e {
+            SetNewTenantConfigError::GetTenant(tid) => {
+                ApiError::NotFound(anyhow!("tenant {}", tid).into())
+            }
+            e @ SetNewTenantConfigError::Persist(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
        match value {
-            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
-            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
-                "Cannot delete timeline which has child timelines"
-            )),
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
+            HasChildren(children) => ApiError::PreconditionFailed(
+                format!("Cannot delete timeline which has child timelines: {children:?}")
+                    .into_boxed_str(),
+            ),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -158,9 +198,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(TenantStateError::NotFound(..)) => {
-                ApiError::PreconditionFailed("Requested tenant is missing")
-            }
+            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
+                "Requested tenant is missing".to_owned().into_boxed_str(),
+            ),
            Tenant(t) => ApiError::from(t),
            Timeline(t) => ApiError::from(t),
        }
@@ -168,12 +208,36 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
 }

 // Helper function to construct a TimelineInfo struct for a timeline
-fn build_timeline_info(
+async fn build_timeline_info(
    timeline: &Arc<Timeline>,
-    _ctx: &RequestContext,
+    include_non_incremental_logical_size: bool,
+    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

+    let mut info = build_timeline_info_common(timeline, ctx).await?;
+    if include_non_incremental_logical_size {
+        // XXX we should be using spawn_ondemand_logical_size_calculation here.
+        // Otherwise, if someone deletes the timeline / detaches the tenant while
+        // we're executing this function, we will outlive the timeline on-disk state.
+        info.current_logical_size_non_incremental = Some(
+            timeline
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                    ctx,
+                )
+                .await?,
+        );
+    }
+    Ok(info)
+}
+
+async fn build_timeline_info_common(
+    timeline: &Arc<Timeline>,
+    ctx: &RequestContext,
+) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -193,8 +257,14 @@ fn build_timeline_info(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size = Some(timeline.get_current_logical_size());
-    let current_physical_size = Some(timeline.layer_size_sum());
+    let current_logical_size = match timeline.get_current_logical_size(ctx) {
+        Ok((size, _)) => Some(size),
+        Err(err) => {
+            error!("Timeline info creation failed to get current logical size: {err:?}");
+            None
+        }
+    };
+    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

@@ -210,6 +280,7 @@ fn build_timeline_info(
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
        current_logical_size,
        current_physical_size,
+        current_logical_size_non_incremental: None,
        timeline_dir_layer_file_size_sum: None,
        wal_source_connstr,
        last_received_msg_lsn,
@@ -222,23 +293,29 @@ fn build_timeline_info(
 }

 // healthcheck handler
-async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn status_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    let config = get_config(&request);
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
    check_permission(&request, Some(tenant_id))?;

-    let new_timeline_id = request_data
-        .new_timeline_id
-        .unwrap_or_else(TimelineId::generate);
+    let new_timeline_id = request_data.new_timeline_id;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

+    let state = get_state(&request);
+
    async {
        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
@@ -246,12 +323,14 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            request_data.ancestor_timeline_id.map(TimelineId::from),
            request_data.ancestor_start_lsn,
            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            state.broker_client.clone(),
            &ctx,
        )
        .await {
            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info(&new_timeline, &ctx)
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
@@ -259,12 +338,17 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

-async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let include_non_incremental_logical_size: Option<bool> =
+        parse_query_param(&request, "include-non-incremental-logical-size")?;
    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -275,11 +359,15 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,

        let mut response_data = Vec::with_capacity(timelines.len());
        for timeline in timelines {
-            let timeline_info = build_timeline_info(&timeline, &ctx)
-                .context(
-                    "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
-                )
-                .map_err(ApiError::InternalServerError)?;
+            let timeline_info = build_timeline_info(
+                &timeline,
+                include_non_incremental_logical_size.unwrap_or(false),
+                &ctx,
+            )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
+            .await
+            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
+            .map_err(ApiError::InternalServerError)?;

            response_data.push(timeline_info);
        }
@@ -291,9 +379,14 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, response_data)
 }

-async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_detail_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let include_non_incremental_logical_size: Option<bool> =
+        parse_query_param(&request, "include-non-incremental-logical-size")?;
    check_permission(&request, Some(tenant_id))?;

    // Logical size calculation needs downloading.
@@ -304,11 +397,16 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body

        let timeline = tenant
            .get_timeline(timeline_id, false)
-            .map_err(ApiError::NotFound)?;
+            .map_err(|e| ApiError::NotFound(e.into()))?;

-        let timeline_info = build_timeline_info(&timeline, &ctx)
-            .context("get local timeline info")
-            .map_err(ApiError::InternalServerError)?;
+        let timeline_info = build_timeline_info(
+            &timeline,
+            include_non_incremental_logical_size.unwrap_or(false),
+            &ctx,
+        )
+        .await
+        .context("get local timeline info")
+        .map_err(ApiError::InternalServerError)?;

        Ok::<_, ApiError>(timeline_info)
    }
@@ -318,7 +416,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, timeline_info)
 }

-async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_lsn_by_timestamp_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -342,11 +443,19 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
    json_response(StatusCode::OK, result)
 }

-// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
-async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_attach_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
+    let tenant_conf = match maybe_body {
+        Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
+        None => TenantConfOpt::default(),
+    };
+
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    info!("Handling tenant attach {tenant_id}");
@@ -357,9 +466,8 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
        mgr::attach_tenant(
            state.conf,
            tenant_id,
-            // XXX: Attach should provide the config, especially during tenant migration.
-            //      See https://github.com/neondatabase/neon/issues/1555
-            TenantConfOpt::default(),
+            tenant_conf,
+            state.broker_client.clone(),
            remote_storage.clone(),
            &ctx,
        )
@@ -374,7 +482,10 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -385,10 +496,14 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;

-    json_response(StatusCode::OK, ())
+    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
+    json_response(StatusCode::ACCEPTED, ())
 }

-async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_detach_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
@@ -402,21 +517,33 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_load_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let state = get_state(&request);
-    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
-        .instrument(info_span!("load", tenant = %tenant_id))
-        .await?;
+    mgr::load_tenant(
+        state.conf,
+        tenant_id,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        &ctx,
+    )
+    .instrument(info_span!("load", tenant = %tenant_id))
+    .await?;

    json_response(StatusCode::ACCEPTED, ())
 }

-async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_ignore_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -429,7 +556,10 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

    let response_data = mgr::list_tenants()
@@ -449,7 +579,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::OK, response_data)
 }

-async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_status(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -459,11 +592,11 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum();
+            current_physical_size += timeline.layer_size_sum().await;
        }

        let state = tenant.current_state();
-        Ok(TenantInfo {
+        Result::<_, ApiError>::Ok(TenantInfo {
            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
@@ -471,8 +604,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        })
    }
    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
-    .await
-    .map_err(ApiError::InternalServerError)?;
+    .await?;

    json_response(StatusCode::OK, tenant_info)
 }
@@ -490,7 +622,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// Note: we don't update the cached size and prometheus metric here.
 /// The retention period might be different, and it's nice to have a method to just calculate it
 /// without modifying anything anyway.
-async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
@@ -502,7 +637,11 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A

    // this can be long operation
    let inputs = tenant
-        .gather_size_inputs(retention_period, &ctx)
+        .gather_size_inputs(
+            retention_period,
+            LogicalSizeCalculationCause::TenantSizeHandler,
+            &ctx,
+        )
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -551,7 +690,10 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
    )
 }

-async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_map_info_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
@@ -560,12 +702,15 @@ async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>
    check_permission(&request, Some(tenant_id))?;

    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let layer_map_info = timeline.layer_map_info(reset);
+    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
 }

-async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -588,7 +733,10 @@ async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>
    }
 }

-async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn evict_timeline_layer_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -666,7 +814,12 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

-async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let target_tenant_id = request_data.new_tenant_id;
    check_permission(&request, None)?;

    let _timer = STORAGE_TIME_GLOBAL
@@ -674,17 +827,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        .expect("bug")
        .start_timer();

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let request_data: TenantCreateRequest = json_request(&mut request).await?;
-
    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    let target_tenant_id = request_data
-        .new_tenant_id
-        .map(TenantId::from)
-        .unwrap_or_else(TenantId::generate);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let state = get_state(&request);

@@ -692,6 +838,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        state.conf,
        tenant_conf,
        target_tenant_id,
+        state.broker_client.clone(),
        state.remote_storage.clone(),
        &ctx,
    )
@@ -717,7 +864,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    )
 }

-async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_tenant_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -743,6 +893,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo

 async fn update_tenant_config_handler(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let request_data: TenantConfigRequest = json_request(&mut request).await?;
    let tenant_id = request_data.tenant_id;
@@ -760,21 +911,25 @@ async fn update_tenant_config_handler(
 }

 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
-#[cfg(feature = "testing")]
-async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_break(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

-    tenant.set_broken("broken from test".to_owned());
+    tenant.set_broken("broken from test".to_owned()).await;

    json_response(StatusCode::OK, ())
 }

-#[cfg(feature = "testing")]
-async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow!(
            "Cannot manage failpoints because pageserver was compiled without failpoints support"
@@ -807,7 +962,10 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }

 // Run GC immediately on given timeline.
-async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_gc_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -826,8 +984,10 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 }

 // Run compaction immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -848,8 +1008,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
 }

 // Run checkpoint immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_checkpoint_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -873,6 +1035,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<

 async fn timeline_download_remote_layers_handler_post(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -888,6 +1051,7 @@ async fn timeline_download_remote_layers_handler_post(

 async fn timeline_download_remote_layers_handler_get(
    request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -897,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get(
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    json_response(StatusCode::OK, info)
 }

@@ -908,10 +1072,13 @@ async fn active_timeline_of_active_tenant(
    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)
+        .map_err(|e| ApiError::NotFound(e.into()))
 }

-async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn always_panic_handler(
+    req: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
    // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
    // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
@@ -922,7 +1089,10 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
    json_response(StatusCode::NO_CONTENT, ())
 }

-async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn disk_usage_eviction_run(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;

    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
@@ -958,8 +1128,6 @@ async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>,
        freed_bytes: 0,
    };

-    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
-
    let (tx, rx) = tokio::sync::oneshot::channel();

    let state = get_state(&r);
@@ -977,7 +1145,7 @@ async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>,
    let _g = cancel.drop_guard();

    crate::task_mgr::spawn(
-        MGMT_REQUEST_RUNTIME.handle(),
+        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
@@ -1012,8 +1180,10 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    )
 }

-#[cfg(feature = "testing")]
-async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn post_tracing_event_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    #[derive(Debug, serde::Deserialize)]
    #[serde(rename_all = "lowercase")]
    enum Level {
@@ -1043,10 +1213,90 @@ async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Bod
    json_response(StatusCode::OK, ())
 }

+/// Common functionality of all the HTTP API handlers.
+///
+/// - Adds a tracing span to each request (by `request_span`)
+/// - Logs the request depending on the request method (by `request_span`)
+/// - Logs the response if it was not successful (by `request_span`)
+/// - Shields the handler function from async cancellations. Hyper can drop the handler
+///   Future if the connection to the client is lost, but most of the pageserver code is
+///   not async cancellation safe. This converts the dropped future into a graceful cancellation
+///   request with a CancellationToken.
+/// - Converts a panic of the spawned handler task into an `InternalServerError` response
+///   (returned as `Ok`, not as `Err`).
+///
+/// `handler` receives the request together with the `CancellationToken` that is cancelled
+/// if this future is dropped before the handler task has finished.
+async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    // Spawn a new task to handle the request, to protect the handler from unexpected
+    // async cancellations. Most pageserver functions are not async cancellation safe.
+    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
+    // with the cancellation token.
+    let token = CancellationToken::new();
+    let cancel_guard = token.clone().drop_guard();
+    let result = request_span(request, move |r| async {
+        let handle = tokio::spawn(
+            async {
+                // Keep a clone so we can still inspect the token after it has been
+                // handed over to the handler.
+                let token_cloned = token.clone();
+                let result = handler(r, token).await;
+                if token_cloned.is_cancelled() {
+                    info!("Cancelled request finished");
+                }
+                result
+            }
+            .in_current_span(),
+        );
+
+        match handle.await {
+            Ok(result) => result,
+            Err(e) => {
+                // The handler task panicked. We have a global panic handler that logs the
+                // panic with its backtrace, so no need to log that here. Only log a brief
+                // message to make it clear that we returned the error to the client.
+                error!("HTTP request handler task panicked: {e:#}");
+
+                // Don't return an Error here, because then fallback error handler that was
+                // installed in make_router() will print the error. Instead, construct the
+                // HTTP error response and return that.
+                Ok(
+                    ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
+                        .into_response(),
+                )
+            }
+        }
+    })
+    .await;
+
+    // We got here without being dropped, i.e. the request was served to completion.
+    // Defuse the guard so that dropping it does not cancel the token.
+    cancel_guard.disarm();
+
+    result
+}
+
+/// Wrapper around `api_handler` for endpoints that only exist in test builds.
+///
+/// When the `testing` feature is compiled in, the request is forwarded to
+/// `api_handler` unchanged. Otherwise `handler` is never invoked and a
+/// `BadRequest` error mentioning `desc` is returned.
+async fn testing_api_handler<R, H>(
+    desc: &str,
+    request: Request<Body>,
+    handler: H,
+) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    // Guard clause: refuse outright in builds without the testing APIs.
+    if !cfg!(feature = "testing") {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Cannot {desc} because pageserver was compiled without testing APIs",
+        )));
+    }
+
+    api_handler(request, handler).await
+}
+
 pub fn make_router(
    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
+    broker_client: BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
@@ -1071,121 +1321,99 @@ pub fn make_router(
        .expect("construct launch timestamp header middleware"),
    );

-    macro_rules! testing_api {
-        ($handler_desc:literal, $handler:path $(,)?) => {{
-            #[cfg(not(feature = "testing"))]
-            async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-                Err(ApiError::BadRequest(anyhow!(concat!(
-                    "Cannot ",
-                    $handler_desc,
-                    " because pageserver was compiled without testing APIs",
-                ))))
-            }
-
-            #[cfg(feature = "testing")]
-            let handler = $handler;
-            #[cfg(not(feature = "testing"))]
-            let handler = cfg_disabled;
-
-            move |r| RequestSpan(handler).handle(r)
-        }};
-    }
-
    Ok(router
        .data(Arc::new(
-            State::new(conf, auth, remote_storage, disk_usage_eviction_state)
-                .context("Failed to initialize router state")?,
+            State::new(
+                conf,
+                auth,
+                remote_storage,
+                broker_client,
+                disk_usage_eviction_state,
+            )
+            .context("Failed to initialize router state")?,
        ))
-        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
-        .put(
-            "/v1/failpoints",
-            testing_api!("manage failpoints", failpoints_handler),
-        )
-        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
-        .post("/v1/tenant", |r| {
-            RequestSpan(tenant_create_handler).handle(r)
-        })
-        .get("/v1/tenant/:tenant_id", |r| {
-            RequestSpan(tenant_status).handle(r)
+        .get("/v1/status", |r| api_handler(r, status_handler))
+        .put("/v1/failpoints", |r| {
+            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
+        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
+        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
-            RequestSpan(tenant_size_handler).handle(r)
+            api_handler(r, tenant_size_handler)
        })
        .put("/v1/tenant/config", |r| {
-            RequestSpan(update_tenant_config_handler).handle(r)
+            api_handler(r, update_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            RequestSpan(get_tenant_config_handler).handle(r)
+            api_handler(r, get_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_list_handler).handle(r)
+            api_handler(r, timeline_list_handler)
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_create_handler).handle(r)
+            api_handler(r, timeline_create_handler)
        })
        .post("/v1/tenant/:tenant_id/attach", |r| {
-            RequestSpan(tenant_attach_handler).handle(r)
+            api_handler(r, tenant_attach_handler)
        })
        .post("/v1/tenant/:tenant_id/detach", |r| {
-            RequestSpan(tenant_detach_handler).handle(r)
+            api_handler(r, tenant_detach_handler)
        })
        .post("/v1/tenant/:tenant_id/load", |r| {
-            RequestSpan(tenant_load_handler).handle(r)
+            api_handler(r, tenant_load_handler)
        })
        .post("/v1/tenant/:tenant_id/ignore", |r| {
-            RequestSpan(tenant_ignore_handler).handle(r)
+            api_handler(r, tenant_ignore_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_detail_handler).handle(r)
+            api_handler(r, timeline_detail_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
+            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            RequestSpan(timeline_gc_handler).handle(r)
+            api_handler(r, timeline_gc_handler)
+        })
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
+            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
-            testing_api!("run timeline compaction", timeline_compact_handler),
-        )
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
-            testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
+            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
        )
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_post),
        )
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_delete_handler).handle(r)
+            api_handler(r, timeline_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            RequestSpan(layer_map_info_handler).handle(r)
+            api_handler(r, layer_map_info_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(layer_download_handler).handle(r),
+            |r| api_handler(r, layer_download_handler),
        )
        .delete(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
+            |r| api_handler(r, evict_timeline_layer_handler),
        )
        .put("/v1/disk_usage_eviction/run", |r| {
-            RequestSpan(disk_usage_eviction_run).handle(r)
+            api_handler(r, disk_usage_eviction_run)
+        })
+        .put("/v1/tenant/:tenant_id/break", |r| {
+            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
+        })
+        .get("/v1/panic", |r| api_handler(r, always_panic_handler))
+        .post("/v1/tracing/event", |r| {
+            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/break",
-            testing_api!("set tenant state to broken", handle_tenant_break),
-        )
-        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
-        .post(
-            "/v1/tracing/event",
-            testing_api!("emit a tracing event", post_tracing_event_handler),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush()?;
+            modification.flush().await?;
        }
    }

    // We're done importing all the data files.
-    modification.commit()?;
+    modification.commit().await?;

    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
    // because there is no guarantee about the order in which we are processing segments.
    // ignore "relation already exists" error
    //
-    // FIXME: use proper error type for this, instead of parsing the error message.
-    // Or better yet, keep track of which relations we've already created
+    // FIXME: Keep track of which relations we've already created?
    // https://github.com/neondatabase/neon/issues/3309
    if let Err(e) = modification
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        if e.to_string().contains("already exists") {
-            debug!("relation {} already exists. we must be extending it", rel);
-        } else {
-            return Err(e);
+        match e {
+            RelationError::AlreadyExists => {
+                debug!("Relation {} already exist. We must be extending it.", rel)
+            }
+            _ => return Err(e.into()),
        }
    }

@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush()?;
+                modification.flush().await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;

-    modification.commit()?;
+    modification.commit().await?;
    Ok(())
 }

@@ -594,7 +594,7 @@ async fn import_file(
        // zenith.signal is not necessarily the last file, that we handle
        // but it is ok to call `finish_write()`, because final `modification.commit()`
        // will update lsn once more to the final one.
-        let writer = modification.tline.writer();
+        let writer = modification.tline.writer().await;
        writer.finish_write(prev_lsn);

        debug!("imported zenith signal {}", prev_lsn);
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -5,7 +5,7 @@ use std::ops::Range;
 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct KeySpace {
    /// Contiguous ranges of keys that belong to the key space. In key order,
    /// and with no overlap.
@@ -61,6 +61,18 @@ impl KeySpace {

        KeyPartitioning { parts }
    }
+
+    ///
+    /// Returns `true` if `range` intersects any range stored in this key space.
+    ///
+    /// The stored ranges are in key order and do not overlap, so only the last
+    /// stored range that starts before `range.end` has to be examined.
+    ///
+    pub fn overlaps(&self, range: &Range<Key>) -> bool {
+        // Whether or not an exact match is found, the returned index is the number
+        // of stored ranges whose start lies strictly before `range.end`.
+        let starting_before = match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
+            Ok(idx) | Err(idx) => idx,
+        };
+
+        if starting_before == 0 {
+            // Every stored range begins at or after the end of `range`.
+            return false;
+        }
+
+        let candidate = &self.ranges[starting_before - 1];
+        candidate.end > range.start
+    }
 }

 ///
@@ -129,3 +141,226 @@ impl KeySpaceAccum {
        }
    }
 }
+
+///
+/// Accumulator that gathers individual keys and key ranges and turns them into
+/// a KeySpace. Ranges can be added in arbitrary order and are allowed to
+/// overlap; they are sorted and merged when the KeySpace is built.
+///
+#[derive(Clone, Debug, Default)]
+pub struct KeySpaceRandomAccum {
+    ranges: Vec<Range<Key>>,
+}
+
+impl KeySpaceRandomAccum {
+    /// Creates an accumulator that holds no ranges yet.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Adds a single key, stored as a one-key range.
+    pub fn add_key(&mut self, key: Key) {
+        let single = singleton_range(key);
+        self.add_range(single)
+    }
+
+    /// Records `range` as-is; sorting and merging are deferred to `to_keyspace`.
+    pub fn add_range(&mut self, range: Range<Key>) {
+        self.ranges.push(range);
+    }
+
+    /// Consumes the accumulator and builds a KeySpace in which the collected
+    /// ranges are sorted by start key and overlapping or adjacent ranges are
+    /// merged into one.
+    pub fn to_keyspace(mut self) -> KeySpace {
+        self.ranges.sort_by_key(|r| r.start);
+
+        let mut merged: Vec<Range<Key>> = Vec::new();
+        for next in self.ranges {
+            if let Some(current) = merged.last_mut() {
+                assert!(next.start >= current.start);
+                if next.start <= current.end {
+                    // Touches or overlaps the range being built: extend it if needed.
+                    if next.end > current.end {
+                        current.end = next.end;
+                    }
+                    continue;
+                }
+            }
+            // Either the very first range, or a gap before `next`: start a new range.
+            merged.push(next);
+        }
+
+        KeySpace { ranges: merged }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fmt::Write;
+
+    // Helper function to create a key range.
+    //
+    // Make the tests below less verbose.
+    fn kr(irange: Range<i128>) -> Range<Key> {
+        Key::from_i128(irange.start)..Key::from_i128(irange.end)
+    }
+
+    // Debugging aid: print every range of a key space, one per line.
+    #[allow(dead_code)]
+    fn dump_keyspace(ks: &KeySpace) {
+        for r in ks.ranges.iter() {
+            println!("  {}..{}", r.start.to_i128(), r.end.to_i128());
+        }
+    }
+
+    // Panic with a readable listing of both range sets if `actual` does not
+    // consist of exactly the `expected` ranges.
+    fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
+        if actual.ranges != expected {
+            let mut msg = String::new();
+
+            writeln!(msg, "expected:").unwrap();
+            for r in &expected {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            writeln!(msg, "got:").unwrap();
+            for r in &actual.ranges {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            panic!("{}", msg);
+        }
+    }
+
+    // Every case below starts from a fresh accumulator so that the cases do
+    // not influence each other.
+    #[test]
+    fn keyspace_add_range() {
+        // two separate ranges
+        //
+        // #####
+        //         #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
+
+        // two separate ranges, added in reverse order
+        //
+        //         #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
+
+        // add range that is adjacent to the end of an existing range
+        //
+        // #####
+        //      #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(10..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is adjacent to the start of an existing range
+        //
+        //      #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the end of an existing range
+        //
+        // #####
+        //    #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(5..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the start of an existing range
+        //
+        //    #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(5..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is fully covered by an existing range
+        //
+        // #########
+        //   #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..30));
+        ks.add_range(kr(10..20));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that extends an existing range from both ends
+        //
+        //   #####
+        // #########
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(0..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add a range that overlaps with two existing ranges, joining them
+        //
+        // #####   #####
+        //    #######
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(5..25));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+    }
+
+    // The key space under test holds the ranges 10..20 and 30..40.
+    #[test]
+    fn keyspace_overlaps() {
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(30..40));
+        let ks = ks.to_keyspace();
+
+        //        #####      #####
+        // xxxx
+        assert!(!ks.overlaps(&kr(0..5)));
+
+        //        #####      #####
+        //   xxxx
+        assert!(!ks.overlaps(&kr(5..9)));
+
+        //        #####      #####
+        //    xxxx
+        assert!(!ks.overlaps(&kr(5..10)));
+
+        //        #####      #####
+        //     xxxx
+        assert!(ks.overlaps(&kr(5..11)));
+
+        //        #####      #####
+        //        xxxx
+        assert!(ks.overlaps(&kr(10..15)));
+
+        //        #####      #####
+        //         xxxx
+        assert!(ks.overlaps(&kr(15..20)));
+
+        //        #####      #####
+        //           xxxx
+        assert!(ks.overlaps(&kr(15..25)));
+
+        //        #####      #####
+        //              xxxx
+        assert!(!ks.overlaps(&kr(22..28)));
+
+        //        #####      #####
+        //               xxxx
+        assert!(!ks.overlaps(&kr(25..30)));
+
+        //        #####      #####
+        //                      xxxx
+        assert!(ks.overlaps(&kr(35..35)));
+
+        //        #####      #####
+        //                        xxxx
+        assert!(!ks.overlaps(&kr(40..45)));
+
+        //        #####      #####
+        //                        xxxx
+        assert!(!ks.overlaps(&kr(45..50)));
+
+        // Query range that begins before the first stored range and ends
+        // exactly where the second one starts: it overlaps only the first.
+        //
+        //        #####      #####
+        //        xxxxxxxxxxx
+        assert!(ks.overlaps(&kr(0..30)));
+    }
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,6 +1,5 @@
 mod auth;
 pub mod basebackup;
-pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
@@ -36,7 +35,7 @@ use tracing::info;
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -46,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

+#[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    // the checkpoint and GC tasks.
    tenant::mgr::shutdown_all_tenants().await;

-    // Stop syncing with remote storage.
-    //
-    // FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
-    // Should it?
-    task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
-
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -138,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
    }
 }

+/// During pageserver startup, operations must be ordered so that blocking work does not
+/// exhaust the tokio worker threads.
+///
+/// Instances of this value exist only during startup; afterwards `None` is provided instead,
+/// meaning no delaying is needed.
+#[derive(Clone)]
+pub struct InitializationOrder {
+    /// Each initial tenant load task carries this until completion.
+    pub initial_tenant_load: Option<utils::completion::Completion>,
+
+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: utils::completion::Completion,
+
+    /// Barrier for when we can start any background jobs.
+    ///
+    /// This can be broken up later on, but right now there is just one class of background job.
+    pub background_jobs_can_start: utils::completion::Barrier,
+}
+
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,4 +1,4 @@
-use metrics::core::{AtomicU64, GenericCounter};
+use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -84,22 +84,48 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// Metrics collected on operations on the storage repository.
-static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
+        "pageserver_read_num_fs_layers",
+        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        &["tenant_id", "timeline_id"],
+        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .expect("failed to define a metric")
+});
+
+// Metrics collected on operations on the storage repository.
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
-        "Time spent in reconstruct_value",
+        "Time spent in reconstruct_value (reconstruct a page from deltas)",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_materialized_cache_hits_direct_total",
+        "Number of cache hits from materialized page cache without redo",
+    )
+    .expect("failed to define a metric")
+});
+
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_getpage_get_reconstruct_data_seconds",
+        "Time spent in get_reconstruct_value_data",
        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
-        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -354,6 +380,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    0.001000, // 1000 usec
    0.030,    // 30 ms
    1.000,    // 1000 ms
+    30.000,   // 30000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -398,6 +425,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+    BasebackupQueryTime({
+        register_histogram_vec!(
+            "pageserver_basebackup_query_seconds",
+            "Histogram of basebackup queries durations, by result type",
+            &["result"],
+            CRITICAL_OP_BUCKETS.into(),
+        )
+        .expect("failed to define a metric")
+    })
+});
+
+impl DurationResultObserver for BasebackupQueryTime {
+    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
+        let label_value = if res.is_ok() { "ok" } else { "error" };
+        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
+        metric.observe(duration.as_secs_f64());
+    }
+}
+
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
@@ -622,7 +670,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the WAL redo process",
+        "Time spent waiting for access to the Postgres WAL redo process",
        redo_histogram_time_buckets!(),
    )
    .expect("failed to define a metric")
@@ -631,7 +679,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
        redo_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -640,7 +688,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of bytes sent to Postgres per redo",
        redo_bytes_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -722,8 +770,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
-    pub reconstruct_time_histo: Histogram,
-    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -734,6 +781,7 @@ pub struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -750,10 +798,7 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
-        let reconstruct_time_histo = RECONSTRUCT_TIME
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
-        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let flush_time_histo =
@@ -794,14 +839,16 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
-            reconstruct_time_histo,
-            materialized_page_cache_hit_counter,
+            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -819,6 +866,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -827,8 +875,7 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
-        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -836,6 +883,8 @@ impl Drop for TimelineMetrics {
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
+
        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
@@ -1270,4 +1319,8 @@ pub fn preinitialize_metrics() {

    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
+
+    // Python tests need these.
+    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
+    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::mgr;
+use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

@@ -172,6 +174,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 ///
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -213,7 +216,14 @@ pub async fn libpq_listener_main(
                    None,
                    "serving compute connection task",
                    false,
-                    page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
+                    page_service_conn_main(
+                        conf,
+                        broker_client.clone(),
+                        local_auth,
+                        socket,
+                        auth_type,
+                        connection_ctx,
+                    ),
                );
            }
            Err(err) => {
@@ -230,6 +240,7 @@ pub async fn libpq_listener_main(

 async fn page_service_conn_main(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
@@ -266,7 +277,7 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -324,6 +335,7 @@ impl PageRequestMetrics {

 struct PageServerHandler {
    _conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<JwtAuth>>,
    claims: Option<Claims>,

@@ -337,11 +349,13 @@ struct PageServerHandler {
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
+        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
+            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -376,7 +390,9 @@ impl PageServerHandler {
        };

        // Check that the timeline exists
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -494,7 +510,12 @@ impl PageServerHandler {

        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        timeline
-            .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
+            .import_basebackup_from_tar(
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
            .await?;

        // Read the end of the tar archive.
@@ -892,10 +913,24 @@ where
                None
            };

-            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
-                .await?;
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            metrics::metric_vec_duration::observe_async_block_duration_by_result(
+                &*crate::metrics::BASEBACKUP_QUERY_TIME,
+                async move {
+                    self.handle_basebackup_request(
+                        pgb,
+                        tenant_id,
+                        timeline_id,
+                        lsn,
+                        None,
+                        false,
+                        ctx,
+                    )
+                    .await?;
+                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                    anyhow::Ok(())
+                },
+            )
+            .await?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1131,7 +1166,9 @@ enum GetActiveTenantError {
        wait_time: Duration,
    },
    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
 }

 impl From<GetActiveTenantError> for QueryError {
@@ -1140,7 +1177,8 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::Other(e) => QueryError::Other(e),
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
        }
    }
 }
@@ -1156,13 +1194,16 @@ async fn get_active_tenant_with_timeout(
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
    let tenant = match mgr::get_tenant(tenant_id, false).await {
        Ok(tenant) => tenant,
-        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active=false")
+        }
    };
    let wait_time = Duration::from_secs(30);
    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
        Ok(Ok(())) => Ok(tenant),
        // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
        Err(_) => {
            let latest_state = tenant.current_state();
            if latest_state == TenantState::Active {
@@ -1177,13 +1218,34 @@ async fn get_active_tenant_with_timeout(
    }
 }

+#[derive(Debug, thiserror::Error)]
+enum GetActiveTimelineError {
+    #[error(transparent)]
+    Tenant(GetActiveTenantError),
+    #[error(transparent)]
+    Timeline(anyhow::Error),
+}
+
+impl From<GetActiveTimelineError> for QueryError {
+    fn from(e: GetActiveTimelineError) -> Self {
+        match e {
+            GetActiveTimelineError::Tenant(e) => e.into(),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+        }
+    }
+}
+
 /// Shorthand for getting a reference to a Timeline of an Active tenant.
 async fn get_active_tenant_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTenantError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
-    let timeline = tenant.get_timeline(timeline_id, true)?;
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
    Ok(timeline)
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -20,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -42,6 +43,16 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub enum RelationError {
+    #[error("Relation Already Exists")]
+    AlreadyExists,
+    #[error("invalid relnode")]
+    InvalidRelnode,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -100,9 +111,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
-                "invalid relnode"
-            )));
+            return Err(PageReconstructError::Other(
+                RelationError::InvalidRelnode.into(),
+            ));
        }

        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -138,17 +149,6 @@ impl Timeline {
        Ok(total_blocks)
    }

-    /// Get timeline logical size
-    pub async fn get_logical_size(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<u64, PageReconstructError> {
-        let mut buf = self.get(LOGICAL_SIZE_KEY, lsn, ctx).await?;
-        let size = buf.get_u64_le();
-        Ok(size)
-    }
-
    /// Get size of a relation file
    pub async fn get_rel_size(
        &self,
@@ -158,9 +158,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
-                "invalid relnode"
-            )));
+            return Err(PageReconstructError::Other(
+                RelationError::InvalidRelnode.into(),
+            ));
        }

        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -203,9 +203,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
-                "invalid relnode"
-            )));
+            return Err(PageReconstructError::Other(
+                RelationError::InvalidRelnode.into(),
+            ));
        }

        // first try to lookup relation in cache
@@ -499,6 +499,46 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

+    /// Does the same as get_current_logical_size but counted on demand.
+    /// Used to initialize the logical size tracking on startup.
+    ///
+    /// Only relation blocks are counted currently. That excludes metadata,
+    /// SLRUs, twophase files etc.
+    pub async fn get_current_logical_size_non_incremental(
+        &self,
+        lsn: Lsn,
+        cancel: CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<u64, CalculateLogicalSizeError> {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // Fetch list of database dirs and iterate them
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
+
+        let mut total_size: u64 = 0;
+        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, lsn, ctx)
+                .await
+                .context("list rels")?
+            {
+                if cancel.is_cancelled() {
+                    return Err(CalculateLogicalSizeError::Cancelled);
+                }
+                let relsize_key = rel_size_to_key(rel);
+                let mut buf = self
+                    .get(relsize_key, lsn, ctx)
+                    .await
+                    .with_context(|| format!("read relation size of {rel:?}"))?;
+                let relsize = buf.get_u32_le();
+
+                total_size += relsize as u64;
+            }
+        }
+        Ok(total_size * BLCKSZ as u64)
+    }
+
    ///
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
@@ -669,6 +709,20 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    #[cfg(test)]
+    pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
+        self.init_empty()?;
+        self.put_control_file(bytes::Bytes::from_static(
+            b"control_file contents do not matter",
+        ))
+        .context("put_control_file")?;
+        self.put_checkpoint(bytes::Bytes::from_static(
+            b"checkpoint_file contents do not matter",
+        ))
+        .context("put_checkpoint_file")?;
+        Ok(())
+    }
+
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -680,7 +734,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        rec: NeonWalRecord,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -707,7 +761,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -789,12 +843,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub fn put_logical_size(&mut self, size: u64) -> anyhow::Result<()> {
-        let buf = size.to_le_bytes();
-        self.put(LOGICAL_SIZE_KEY, Value::Image(Bytes::from(buf.to_vec())));
-        Ok(())
-    }
-
    pub async fn drop_dbdir(
        &mut self,
        spcnode: Oid,
@@ -837,32 +885,38 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+    ) -> Result<(), RelationError> {
+        if rel.relnode == 0 {
+            return Err(RelationError::InvalidRelnode); // relnode 0 is invalid, not a duplicate
+        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
+            .context("deserialize db")?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
            // Didn't exist. Update dbdir
            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
-            let buf = DbDirectory::ser(&dbdir)?;
+            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));

            // and create the RelDirectory
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
+                .context("deserialize db")?
        };

        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            anyhow::bail!("rel {rel} already exists");
+            return Err(RelationError::AlreadyExists);
        }
        self.put(
            rel_dir_key,
-            Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
+            Value::Image(Bytes::from(
+                RelDirectory::ser(&rel_dir).context("serialize")?,
+            )),
        );

        // Put size
@@ -887,7 +941,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        let last_lsn = self.tline.get_last_record_lsn();
        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
@@ -918,7 +972,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);

        // Put size
        let size_key = rel_size_to_key(rel);
@@ -939,7 +993,7 @@ impl<'a> DatadirModification<'a> {

    /// Drop a relation.
    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);

        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1084,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1092,23 +1146,23 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: anyhow::Result<()> = Ok(());
-        self.pending_updates.retain(|&key, value| {
-            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
-                result = writer.put(key, self.lsn, value);
-                false
+        let mut retained_pending_updates = HashMap::new();
+        for (key, value) in self.pending_updates.drain() {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
+                // NOTE: bailing out with `?` drops the drain() iterator, which discards every
+                // remaining pending update. That's presumably Ok, cf this function's doc comment.
+                writer.put(key, self.lsn, &value).await?;
            } else {
-                true
+                retained_pending_updates.insert(key, value);
            }
-        });
-        result?;
+        }
+        self.pending_updates.extend(retained_pending_updates);

        if pending_nblocks != 0 {
-            let size = writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
-            self.put_logical_size(size)?;
+            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
            self.pending_nblocks = 0;
        }

@@ -1120,24 +1174,23 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub fn commit(&mut self) -> anyhow::Result<()> {
-        let writer = self.tline.writer();
+    pub async fn commit(&mut self) -> anyhow::Result<()> {
+        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value)?;
+            writer.put(key, lsn, &value).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn)?;
+            writer.delete(key_range, lsn).await?;
        }

        writer.finish_write(lsn);

        if pending_nblocks != 0 {
-            let size = writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
-            self.put_logical_size(size)?;
+            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
        }

        Ok(())
@@ -1252,7 +1305,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // 03 misc
 //    controlfile
 //    checkpoint
-//    logical_size
+//    pg_version
 //
 // Below is a full list of the keyspace allocation:
 //
@@ -1292,10 +1345,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // Checkpoint:
 // 03 00000000 00000000 00000000 00   00000001
 //-- Section 01: relation data and metadata
-//
-// LogicalSize:
-// 03 00000000 00000000 00000000 00   00000002
-//

 const DBDIR_KEY: Key = Key {
    field1: 0x00,
@@ -1518,15 +1567,6 @@ const CHECKPOINT_KEY: Key = Key {
    field6: 1,
 };

-const LOGICAL_SIZE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 3,
-};
-
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

@@ -1584,22 +1624,6 @@ fn is_slru_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff // and not SlruSegSize
 }

-#[cfg(test)]
-pub fn create_test_timeline(
-    tenant: &crate::tenant::Tenant,
-    timeline_id: utils::id::TimelineId,
-    pg_version: u32,
-    ctx: &RequestContext,
-) -> anyhow::Result<std::sync::Arc<Timeline>> {
-    let tline = tenant
-        .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
-        .initialize(ctx)?;
-    let mut m = tline.begin_modification(Lsn(8));
-    m.init_empty()?;
-    m.commit()?;
-    Ok(tline)
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -237,6 +237,11 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

+    // Initial logical size calculation
+    InitialLogicalSizeCalculation,
+
+    OndemandLogicalSizeCalculation,
+
    // Task that flushes frozen in-memory layers to disk
    LayerFlushTask,

@@ -252,6 +257,9 @@ pub enum TaskKind {
    // task that handles attaching a tenant
    Attach,

+    // Used mostly for background deletion from s3
+    TimelineDeletionWorker,
+
    // task that handhes metrics collection
    MetricsCollection,

@@ -471,18 +479,35 @@ pub async fn shutdown_tasks(
                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
-                victim_tasks.push(Arc::clone(task));
+                victim_tasks.push((
+                    Arc::clone(task),
+                    task.kind,
+                    task_mut.tenant_id,
+                    task_mut.timeline_id,
+                ));
            }
        }
    }

-    for task in victim_tasks {
+    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+
+    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
+            if log_all {
+                if tenant_id.is_none() {
+                    // there are quite few of these
+                    info!(name = task.name, kind = ?task_kind, "stopping global task");
+                } else {
+                    // warn to catch these in tests; there shouldn't be any
+                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                }
+            }
            let completed = tokio::select! {
+                biased;
                _ = &mut join_handle => { true },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -38,8 +38,8 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
@@ -99,6 +99,7 @@ pub struct TenantConf {
    // See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
+    pub gc_feedback: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -175,6 +176,10 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub gc_feedback: Option<bool>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -242,6 +247,7 @@ impl TenantConfOpt {
            evictions_low_residence_duration_metric_threshold: self
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
+            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
        }
    }
 }
@@ -278,6 +284,7 @@ impl Default for TenantConf {
                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            gc_feedback: false,
        }
    }
 }
@@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
                    ))?,
            );
        }
+        tenant_conf.gc_feedback = request_data.gc_feedback;

        Ok(tenant_conf)
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
+use anyhow::Context;
 use anyhow::Result;
+use std::collections::HashMap;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
@@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::Replacement;

 use super::storage_layer::range_eq;
+use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::PersistentLayerKey;

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -86,11 +90,16 @@ pub struct LayerMap<L: ?Sized> {
    pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,

    /// Index of the historic layers optimized for search
-    historic: BufferedHistoricLayerCoverage<Arc<L>>,
+    historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,

    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
-    l0_delta_layers: Vec<Arc<L>>,
+    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
+
+    /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
+    /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
+    /// RemoteLayer will be removed.
+    mapping: HashMap<PersistentLayerKey, Arc<L>>,
 }

 impl<L: ?Sized> Default for LayerMap<L> {
@@ -101,6 +110,7 @@ impl<L: ?Sized> Default for LayerMap<L> {
            frozen_layers: VecDeque::default(),
            l0_delta_layers: Vec::default(),
            historic: BufferedHistoricLayerCoverage::default(),
+            mapping: HashMap::default(),
        }
    }
 }
@@ -125,8 +135,9 @@ where
    ///
    /// Insert an on-disk layer.
    ///
-    pub fn insert_historic(&mut self, layer: Arc<L>) {
-        self.layer_map.insert_historic_noflush(layer)
+    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
+    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+        self.layer_map.insert_historic_noflush(layer_desc, layer)
    }

    ///
@@ -134,8 +145,8 @@ where
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer: Arc<L>) {
-        self.layer_map.remove_historic_noflush(layer)
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+        self.layer_map.remove_historic_noflush(layer_desc, layer)
    }

    /// Replaces existing layer iff it is the `expected`.
@@ -150,12 +161,15 @@ where
    ///      that we can replace values only by updating a hashmap.
    pub fn replace_historic(
        &mut self,
+        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
+        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));

-        self.layer_map.replace_historic_noflush(expected, new)
+        self.layer_map
+            .replace_historic_noflush(expected_desc, expected, new_desc, new)
    }

    // We will flush on drop anyway, but this method makes it
@@ -230,6 +244,7 @@ where
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
+                let image = self.get_layer_from_mapping(&image.key()).clone();
                Some(SearchResult {
                    layer: image,
                    lsn_floor,
@@ -237,6 +252,7 @@ where
            }
            (Some(delta), None) => {
                let lsn_floor = delta.get_lsn_range().start;
+                let delta = self.get_layer_from_mapping(&delta.key()).clone();
                Some(SearchResult {
                    layer: delta,
                    lsn_floor,
@@ -247,6 +263,7 @@ where
                let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                let image_exact_match = img_lsn + 1 == end_lsn;
                if image_is_newer || image_exact_match {
+                    let image = self.get_layer_from_mapping(&image.key()).clone();
                    Some(SearchResult {
                        layer: image,
                        lsn_floor: img_lsn,
@@ -254,6 +271,7 @@ where
                } else {
                    let lsn_floor =
                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
+                    let delta = self.get_layer_from_mapping(&delta.key()).clone();
                    Some(SearchResult {
                        layer: delta,
                        lsn_floor,
@@ -273,16 +291,33 @@ where
    ///
    /// Helper function for BatchedUpdates::insert_historic
    ///
-    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+    /// TODO(chi): remove L generic so that we do not need to pass layer object.
+    pub(self) fn insert_historic_noflush(
+        &mut self,
+        layer_desc: PersistentLayerDesc,
+        layer: Arc<L>,
+    ) {
+        self.mapping.insert(layer_desc.key(), layer.clone());
+
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
-        self.historic.insert(
-            historic_layer_coverage::LayerKey::from(&*layer),
-            Arc::clone(&layer),
-        );

        if Self::is_l0(&layer) {
-            self.l0_delta_layers.push(layer);
+            self.l0_delta_layers.push(layer_desc.clone().into());
        }
+
+        self.historic.insert(
+            historic_layer_coverage::LayerKey::from(&*layer),
+            layer_desc.into(),
+        );
+    }
+
+    /// Resolves `key` to the layer object stored in `mapping`.
+    /// Panics if the key is missing, since that means `mapping` and the layer index disagree.
+    fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
+        self.mapping
+            .get(key)
+            .with_context(|| format!("{key:?}"))
+            .expect("inconsistent layer mapping")
    }

    ///
@@ -290,14 +325,16 @@ where
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(&*layer));
-
        if Self::is_l0(&layer) {
            let len_before = self.l0_delta_layers.len();
-            self.l0_delta_layers
-                .retain(|other| !Self::compare_arced_layers(other, &layer));
+            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
+            l0_delta_layers.retain(|other| {
+                !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
+            });
+            self.l0_delta_layers = l0_delta_layers;
            // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
            // there's a chance that the comparison fails at runtime due to it comparing (pointer,
            // vtable) pairs.
@@ -307,11 +344,14 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
+        self.mapping.remove(&layer_desc.key());
    }

    pub(self) fn replace_historic_noflush(
        &mut self,
+        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
+        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        let key = historic_layer_coverage::LayerKey::from(&**expected);
@@ -332,10 +372,9 @@ where

        let l0_index = if expected_l0 {
            // find the index in case replace worked, we need to replace that as well
-            let pos = self
-                .l0_delta_layers
-                .iter()
-                .position(|slot| Self::compare_arced_layers(slot, expected));
+            let pos = self.l0_delta_layers.iter().position(|slot| {
+                Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
+            });

            if pos.is_none() {
                return Ok(Replacement::NotFound);
@@ -345,16 +384,28 @@ where
            None
        };

-        let replaced = self.historic.replace(&key, new.clone(), |existing| {
-            Self::compare_arced_layers(existing, expected)
+        let new_desc = Arc::new(new_desc);
+        let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
+            **existing == expected_desc
        });

        if let Replacement::Replaced { .. } = &replaced {
+            self.mapping.remove(&expected_desc.key());
+            self.mapping.insert(new_desc.key(), new);
            if let Some(index) = l0_index {
-                self.l0_delta_layers[index] = new;
+                self.l0_delta_layers[index] = new_desc;
            }
        }

+        let replaced = match replaced {
+            Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
+            Replacement::NotFound => Replacement::NotFound,
+            Replacement::RemovalBuffered => Replacement::RemovalBuffered,
+            Replacement::Unexpected(x) => {
+                Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
+            }
+        };
+
        Ok(replaced)
    }

@@ -383,7 +434,7 @@ where
        let start = key.start.to_i128();
        let end = key.end.to_i128();

-        let layer_covers = |layer: Option<Arc<L>>| match layer {
+        let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
            Some(layer) => layer.get_lsn_range().start >= lsn.start,
            None => false,
        };
@@ -404,7 +455,9 @@ where
    }

    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
-        self.historic.iter()
+        self.historic
+            .iter()
+            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
    }

    ///
@@ -436,14 +489,24 @@ where
        // Loop through the change events and push intervals
        for (change_key, change_val) in version.image_coverage.range(start..end) {
            let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
-            coverage.push((kr, current_val.take()));
+            coverage.push((
+                kr,
+                current_val
+                    .take()
+                    .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
+            ));
            current_key = change_key;
            current_val = change_val.clone();
        }

        // Add the final interval
        let kr = Key::from_i128(current_key)..Key::from_i128(end);
-        coverage.push((kr, current_val.take()));
+        coverage.push((
+            kr,
+            current_val
+                .take()
+                .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
+        ));

        Ok(coverage)
    }
@@ -532,7 +595,9 @@ where
                    let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                    let lr = lsn.start..val.get_lsn_range().start;
                    if !kr.is_empty() {
-                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                        let base_count =
+                            Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
+                                as usize;
                        let new_limit = limit.map(|l| l - base_count);
                        let max_stacked_deltas_underneath =
                            self.count_deltas(&kr, &lr, new_limit)?;
@@ -555,7 +620,9 @@ where
                let lr = lsn.start..val.get_lsn_range().start;

                if !kr.is_empty() {
-                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                    let base_count =
+                        Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
+                            as usize;
                    let new_limit = limit.map(|l| l - base_count);
                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
                    max_stacked_deltas = std::cmp::max(
@@ -706,7 +773,11 @@ where

    /// Return all L0 delta layers
    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
-        Ok(self.l0_delta_layers.clone())
+        Ok(self
+            .l0_delta_layers
+            .iter()
+            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
+            .collect())
    }

    /// debugging function to print out the contents of the layer map
@@ -809,12 +880,17 @@ mod tests {
            let layer = LayerDescriptor::from(layer);

            // same skeletan construction; see scenario below
-            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
-            let new_version: Arc<dyn Layer> = Arc::new(layer);
+            let not_found = Arc::new(layer.clone());
+            let new_version = Arc::new(layer);

            let mut map = LayerMap::default();

-            let res = map.batch_update().replace_historic(&not_found, new_version);
+            let res = map.batch_update().replace_historic(
+                not_found.get_persistent_layer_desc(),
+                &not_found,
+                new_version.get_persistent_layer_desc(),
+                new_version,
+            );

            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
        }
@@ -823,8 +899,8 @@ mod tests {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = LayerDescriptor::from(name);

-            let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
-            let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
+            let remote = Arc::new(skeleton.clone());
+            let downloaded = Arc::new(skeleton);

            let mut map = LayerMap::default();

@@ -834,12 +910,18 @@ mod tests {

            let expected_in_counts = (1, usize::from(expected_l0));

-            map.batch_update().insert_historic(remote.clone());
+            map.batch_update()
+                .insert_historic(remote.get_persistent_layer_desc(), remote.clone());
            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);

            let replaced = map
                .batch_update()
-                .replace_historic(&remote, downloaded.clone())
+                .replace_historic(
+                    remote.get_persistent_layer_desc(),
+                    &remote,
+                    downloaded.get_persistent_layer_desc(),
+                    downloaded.clone(),
+                )
                .expect("name derived attributes are the same");
            assert!(
                matches!(replaced, Replacement::Replaced { .. }),
@@ -847,11 +929,12 @@ mod tests {
            );
            assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);

-            map.batch_update().remove_historic(downloaded.clone());
+            map.batch_update()
+                .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
            assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
        }

-        fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
+        fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
                .filter(|x| LayerMap::compare_arced_layers(x, layer))
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -204,6 +204,35 @@ fn test_off_by_one() {
    assert_eq!(version.image_coverage.query(5), None);
 }

+/// White-box regression test, checking for incorrect removal of node at key.end
+#[test]
+fn test_regression() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 0..5,
+            is_image: false,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 1..2,
+            is_image: false,
+        },
+        "Layer 2".to_string(),
+    );
+
+    // If an insertion operation improperly deletes the endpoint of a previous layer
+    // (which is more likely to happen with layers that collide on key.end), we will
+    // end up with an infinite layer, covering the entire keyspace. Here we assert
+    // that there's no layer at key 100 because we didn't insert any layer there.
+    let version = map.get_version(100).unwrap();
+    assert_eq!(version.delta_coverage.query(100), None);
+}
+
 /// Cover edge cases where layers begin or end on the same key
 #[test]
 fn test_key_collision() {
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -1,8 +1,8 @@
 use std::ops::Range;

-// TODO the `im` crate has 20x more downloads and also has
-// persistent/immutable BTree. It also runs a bit faster but
-// results are not the same on some tests.
+// NOTE the `im` crate has 20x more downloads and also has
+// persistent/immutable BTree. But it's bugged so rpds is a
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync;
 /// - iterate the latest layers in a key range
 /// - insert layers in non-decreasing lsn.start order
 ///
-/// The struct is parameterized over Value for easier
-/// testing, but in practice it's some sort of layer.
+/// For a detailed explanation and justification of this approach, see:
+/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
+///
+/// NOTE The struct is parameterized over Value for easier
+///      testing, but in practice it's some sort of layer.
 pub struct LayerCoverage<Value> {
    /// For every change in coverage (as we sweep the key space)
    /// we store (lsn.end, value).
    ///
-    /// We use an immutable/persistent tree so that we can keep historic
-    /// versions of this coverage without cloning the whole thing and
-    /// incurring quadratic memory cost. See HistoricLayerCoverage.
+    /// NOTE We use an immutable/persistent tree so that we can keep historic
+    ///      versions of this coverage without cloning the whole thing and
+    ///      incurring quadratic memory cost. See HistoricLayerCoverage.
    ///
-    /// We use the Sync version of the map because we want Self to
-    /// be Sync. Using nonsync might be faster, if we can work with
-    /// that.
+    /// NOTE We use the Sync version of the map because we want Self to
+    ///      be Sync. Using nonsync might be faster, if we can work with
+    ///      that.
    nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
 }

@@ -41,6 +44,13 @@ impl<Value: Clone> LayerCoverage<Value> {

    /// Helper function to subdivide the key range without changing any values
    ///
+    /// This operation has no semantic effect by itself. It only helps us pin in
+    /// place the part of the coverage we don't want to change when inserting.
+    ///
+    /// As an analogy, think of a polygon. If you add a vertex along one of the
+    /// segments, the polygon is still the same, but it behaves differently when
+    /// we move or delete one of the other points.
+    ///
    /// Complexity: O(log N)
    fn add_node(&mut self, key: i128) {
        let value = match self.nodes.range(..=key).last() {
@@ -74,7 +84,7 @@ impl<Value: Clone> LayerCoverage<Value> {
        let mut to_update = Vec::new();
        let mut to_remove = Vec::new();
        let mut prev_covered = false;
-        for (k, node) in self.nodes.range(key.clone()) {
+        for (k, node) in self.nodes.range(key) {
            let needs_cover = match node {
                None => true,
                Some((h, _)) => h < &lsn.end,
@@ -87,9 +97,8 @@ impl<Value: Clone> LayerCoverage<Value> {
            }
            prev_covered = needs_cover;
        }
-        if !prev_covered {
-            to_remove.push(key.end);
-        }
+        // TODO check if the nodes inserted at key.start and key.end are safe
+        //      to remove. It's fine to keep them but they could be redundant.
        for k in to_update {
            self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
        }
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
+//! This module contains the encoding and decoding of the local manifest file.
+//!
+//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
+//! records the state of the storage engine. It contains a snapshot of the
+//! state and all operations following that snapshot. The file begins with a
+//! header recording MANIFEST version number. After that, it contains a snapshot.
+//! The snapshot is followed by a list of operations. Each operation is a list
+//! of records. Each record is either an addition or a removal of a layer.
+//!
+//! With MANIFEST, we can:
+//!
+//! 1. recover state quickly by reading the file, potentially boosting the
+//!    startup speed.
+//! 2. ensure all operations are atomic and avoid corruption, solving issues
+//!    like redundant image layers and preparing us for future compaction
+//!    strategies.
+//!
+//! There is also a format for storing all layer files on S3, called
+//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
+//! records all operations as logs, and therefore we can easily replay the
+//! operations when recovering from crash, while ensuring those operations
+//! are atomic upon restart.
+//!
+//! Currently, this is not used in the system. Future refactors will ensure
+//! the storage state will be recorded in this file, and the system can be
+//! recovered from this file. This is tracked in
+//! https://github.com/neondatabase/neon/issues/4418
+
+use std::io::{self, Read, Write};
+
+use crate::virtual_file::VirtualFile;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crc32c::crc32c;
+use serde::{Deserialize, Serialize};
+use tracing::log::warn;
+use utils::lsn::Lsn;
+
+use super::storage_layer::PersistentLayerDesc;
+
+pub struct Manifest {
+    file: VirtualFile,
+}
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct Snapshot {
+    pub layers: Vec<PersistentLayerDesc>,
+}
+
+/// By default serde encodes this as an externally tagged enum, so a record looks
+/// like `{ "AddLayer": { ... } }`.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Record {
+    AddLayer(PersistentLayerDesc),
+    RemoveLayer(PersistentLayerDesc),
+}
+
+/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
+const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
+const MANIFEST_VERSION: u64 = 1;
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct ManifestHeader {
+    magic_number: u64,
+    version: u64,
+}
+
+const MANIFEST_HEADER_LEN: usize = 16;
+
+impl ManifestHeader {
+    fn encode(&self) -> BytesMut {
+        let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
+        buf.put_u64(self.magic_number);
+        buf.put_u64(self.version);
+        buf
+    }
+
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
+        Self {
+            magic_number: buf.get_u64(),
+            version: buf.get_u64(),
+        }
+    }
+}
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Operation {
+    /// A snapshot of the current state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
+    Snapshot(Snapshot, Lsn),
+    /// An atomic operation that changes the state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
+    /// This will only change when new L0 is flushed to the disk.
+    Operation(Vec<Record>, Lsn),
+}
+
+struct RecordHeader {
+    size: u32,
+    checksum: u32,
+}
+
+const RECORD_HEADER_LEN: usize = 8;
+
+impl RecordHeader {
+    fn encode(&self) -> BytesMut {
+        let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
+        buf.put_u32(self.size);
+        buf.put_u32(self.checksum);
+        buf
+    }
+
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
+        Self {
+            size: buf.get_u32(),
+            checksum: buf.get_u32(),
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum ManifestLoadError {
+    #[error("manifest header is corrupted")]
+    CorruptedManifestHeader,
+    #[error("unsupported manifest version: got {0}, expected {1}")]
+    UnsupportedVersion(u64, u64),
+    #[error("error when decoding record: {0}")]
+    DecodeRecord(serde_json::Error),
+    #[error("I/O error: {0}")]
+    Io(io::Error),
+}
+
+#[must_use = "Should check if the manifest is partially corrupted"]
+pub struct ManifestPartiallyCorrupted(bool);
+
+impl Manifest {
+    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
+    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
+        let mut manifest = Self { file };
+        manifest.append_manifest_header(ManifestHeader {
+            magic_number: MANIFEST_MAGIC_NUMBER,
+            version: MANIFEST_VERSION,
+        })?;
+        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
+        Ok(manifest)
+    }
+
+    /// Load a manifest. Returns the manifest and a list of operations. If a record is truncated or
+    /// fails its checksum, decoding stops there, the flag is set to true, and the user is
+    /// responsible for backing up the current manifest and reconstructing a new one.
+    pub fn load(
+        mut file: VirtualFile,
+    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
+        let mut buf = vec![];
+        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
+
+        // Read manifest header: a short file or a foreign magic number means a corrupted header
+        let mut buf = Bytes::from(buf);
+        let header = match buf.get(..MANIFEST_HEADER_LEN).map(ManifestHeader::decode) {
+            Some(header) if header.magic_number == MANIFEST_MAGIC_NUMBER => header,
+            _ => return Err(ManifestLoadError::CorruptedManifestHeader),
+        };
+        buf.advance(MANIFEST_HEADER_LEN);
+        if header.version != MANIFEST_VERSION {
+            return Err(ManifestLoadError::UnsupportedVersion(
+                header.version,
+                MANIFEST_VERSION,
+            ));
+        }
+
+        // Read operations
+        let mut operations = Vec::new();
+        let corrupted = loop {
+            if buf.remaining() == 0 {
+                break false;
+            }
+            if buf.remaining() < RECORD_HEADER_LEN {
+                warn!("incomplete header when decoding manifest, could be corrupted");
+                break true;
+            }
+            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
+            let size = size as usize;
+            buf.advance(RECORD_HEADER_LEN);
+            if buf.remaining() < size {
+                warn!("incomplete data when decoding manifest, could be corrupted");
+                break true;
+            }
+            let data = &buf[..size];
+            if crc32c(data) != checksum {
+                warn!("checksum mismatch when decoding manifest, could be corrupted");
+                break true;
+            }
+            // if the following decode fails, we cannot use the manifest or safely ignore any record.
+            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
+            buf.advance(size);
+        };
+        Ok((
+            Self { file },
+            operations,
+            ManifestPartiallyCorrupted(corrupted),
+        ))
+    }
+
+    /// Append one record: a header holding the payload size and its CRC32C, then the payload,
+    /// then `sync_all` on the file. Fails if the payload is `u32::MAX` bytes or larger.
+    fn append_data(&mut self, data: &[u8]) -> Result<()> {
+        anyhow::ensure!(data.len() < u32::MAX as usize, "data too large");
+        let header = RecordHeader {
+            size: data.len() as u32,
+            checksum: crc32c(data),
+        };
+        let header = header.encode();
+        self.file.write_all(&header)?;
+        self.file.write_all(data)?;
+        self.file.sync_all()?;
+        Ok(())
+    }
+
+    /// Write the encoded manifest header. No sync here; `init` appends a record right after.
+    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
+        self.file.write_all(&header.encode())?;
+        Ok(())
+    }
+
+    /// Add an operation to the manifest. The operation will be appended to the end of the file,
+    /// and the file will fsync.
+    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
+        let encoded = serde_json::to_vec(&operation)?;
+        self.append_data(&encoded)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+
+    use crate::repository::Key;
+
+    use super::*;
+
+    #[test]
+    fn test_read_manifest() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
+        std::fs::create_dir_all(&testdir).unwrap();
+        let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
+        let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
+        let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
+        let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
+        let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
+
+        // Write a manifest with a snapshot and some operations
+        let snapshot = Snapshot {
+            layers: vec![layer1, layer2],
+        };
+        let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
+        manifest
+            .append_operation(Operation::Operation(
+                vec![Record::AddLayer(layer3.clone())],
+                Lsn::from(1),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the second time and write
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 2);
+        assert_eq!(
+            &operations[0],
+            &Operation::Snapshot(snapshot.clone(), Lsn::from(0))
+        );
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        manifest
+            .append_operation(Operation::Operation(
+                vec![
+                    Record::RemoveLayer(layer3.clone()),
+                    Record::AddLayer(layer4.clone()),
+                ],
+                Lsn::from(2),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the third time and verify
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 3);
+        assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        assert_eq!(
+            &operations[2],
+            &Operation::Operation(
+                vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
+                Lsn::from(2)
+            )
+        );
+    }
+}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -10,6 +10,7 @@ use tokio::fs;
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use tokio::sync::RwLock;
+use tokio::task::JoinSet;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
@@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::IGNORED_TENANT_FILE_NAME;
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
@@ -58,10 +59,12 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
 /// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
-#[instrument(skip(conf, remote_storage))]
+#[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -116,7 +119,9 @@ pub async fn init_tenant_mgr(
                    match schedule_local_tenant_processing(
                        conf,
                        &tenant_dir_path,
+                        broker_client.clone(),
                        remote_storage.clone(),
+                        Some(init_order.clone()),
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -150,7 +155,9 @@ pub async fn init_tenant_mgr(
 pub fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_order: Option<InitializationOrder>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -186,7 +193,7 @@ pub fn schedule_local_tenant_processing(
    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
        if let Some(remote_storage) = remote_storage {
-            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+            match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -204,7 +211,14 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            broker_client,
+            remote_storage,
+            init_order,
+            ctx,
+        )
    };
    Ok(tenant)
 }
@@ -219,6 +233,7 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
+#[instrument]
 pub async fn shutdown_all_tenants() {
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
@@ -235,39 +250,51 @@ pub async fn shutdown_all_tenants() {
                tenants_clone
            }
            TenantsMap::ShuttingDown(_) => {
+                // TODO: it is possible that detach and shutdown happen at the same time. As a
+                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

-    let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
-    for (_, tenant) in tenants_to_shut_down {
-        if tenant.is_active() {
-            // updates tenant state, forbidding new GC and compaction iterations from starting
-            tenant.set_stopping();
-            tenants_to_freeze_and_flush.push(tenant);
+    let mut join_set = JoinSet::new();
+    for (tenant_id, tenant) in tenants_to_shut_down {
+        join_set.spawn(
+            async move {
+                let freeze_and_flush = true;
+
+                match tenant.shutdown(freeze_and_flush).await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(super::ShutdownError::AlreadyStopping) => {
+                        warn!("tenant was already shutting down")
+                    }
+                }
+            }
+            .instrument(info_span!("shutdown", %tenant_id)),
+        );
+    }
+
+    let mut panicked = 0;
+
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
+            }
        }
    }

-    // Shut down all existing walreceiver connections and stop accepting the new ones.
-    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
-
-    // Ok, no background tasks running anymore. Flush any remaining data in
-    // memory to disk.
-    //
-    // We assume that any incoming connections that might request pages from
-    // the tenant have already been terminated by the caller, so there
-    // should be no more activity in any of the repositories.
-    //
-    // On error, log it but continue with the shutdown for other tenants.
-    for tenant in tenants_to_freeze_and_flush {
-        let tenant_id = tenant.tenant_id();
-        debug!("shutdown tenant {tenant_id}");
-
-        if let Err(err) = tenant.freeze_and_flush().await {
-            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
-        }
+    if panicked > 0 {
+        warn!(panicked, "observed panicks while shutting down tenants");
    }
 }

@@ -275,10 +302,11 @@ pub async fn create_tenant(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
@@ -287,7 +315,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -296,16 +324,23 @@ pub async fn create_tenant(
                tenant_id == crated_tenant_id,
                "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
            );
-        vacant_entry.insert(Arc::clone(&created_tenant));
        Ok(created_tenant)
    }).await
 }

+#[derive(Debug, thiserror::Error)]
+pub enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+}
+
 pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
+) -> Result<(), SetNewTenantConfigError> {
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

@@ -315,23 +350,32 @@ pub async fn set_new_tenant_config(
        &tenant_config_path,
        new_tenant_conf,
        false,
-    )?;
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    Ok(())
 }

+#[derive(Debug, thiserror::Error)]
+pub enum GetTenantError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+}
+
 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
-) -> Result<Arc<Tenant>, TenantStateError> {
+) -> Result<Arc<Tenant>, GetTenantError> {
    let m = TENANTS.read().await;
    let tenant = m
        .get(&tenant_id)
-        .ok_or(TenantStateError::NotFound(tenant_id))?;
+        .ok_or(GetTenantError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        Err(TenantStateError::NotActive(tenant_id))
+        Err(GetTenantError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
    }
@@ -340,7 +384,7 @@ pub async fn get_tenant(
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
-    Tenant(#[from] TenantStateError),
+    Tenant(#[from] GetTenantError),

    #[error("Timeline {0}")]
    Timeline(#[from] crate::tenant::DeleteTimelineError),
@@ -352,7 +396,9 @@ pub async fn delete_timeline(
    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant.delete_timeline(timeline_id, ctx).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
    Ok(())
 }

@@ -405,10 +451,11 @@ pub async fn detach_tenant(
 pub async fn load_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
        let tenant_path = conf.tenant_path(&tenant_id);
        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
        if tenant_ignore_mark.exists() {
@@ -416,14 +463,14 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;

-        vacant_entry.insert(new_tenant);
-        Ok(())
-    }).await
+        Ok(new_tenant)
+    }).await?;
+    Ok(())
 }

 pub async fn ignore_tenant(
@@ -473,10 +520,11 @@ pub async fn attach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_conf: TenantConfOpt,
+    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
        let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233
@@ -488,7 +536,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -497,10 +545,10 @@ pub async fn attach_tenant(
            tenant_id == attached_tenant_id,
            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
        );
-        vacant_entry.insert(Arc::clone(&attached_tenant));
-        Ok(())
+        Ok(attached_tenant)
    })
-    .await
+    .await?;
+    Ok(())
 }

 #[derive(Debug, thiserror::Error)]
@@ -521,12 +569,12 @@ pub enum TenantMapInsertError {
 ///
 /// NB: the closure should return quickly because the current implementation of tenants map
 /// serializes access through an `RwLock`.
-async fn tenant_map_insert<F, V>(
+async fn tenant_map_insert<F>(
    tenant_id: TenantId,
    insert_fn: F,
-) -> Result<V, TenantMapInsertError>
+) -> Result<Arc<Tenant>, TenantMapInsertError>
 where
-    F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
+    F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
 {
    let mut guard = TENANTS.write().await;
    let m = match &mut *guard {
@@ -539,8 +587,11 @@ where
            tenant_id,
            e.get().current_state(),
        )),
-        hash_map::Entry::Vacant(v) => match insert_fn(v) {
-            Ok(v) => Ok(v),
+        hash_map::Entry::Vacant(v) => match insert_fn() {
+            Ok(tenant) => {
+                v.insert(tenant.clone());
+                Ok(tenant)
+            }
            Err(e) => Err(TenantMapInsertError::Closure(e)),
        },
    }
@@ -561,25 +612,26 @@ where
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
-    {
-        let tenants_accessor = TENANTS.write().await;
-        match tenants_accessor.get(&tenant_id) {
-            Some(tenant) => match tenant.current_state() {
-                TenantState::Attaching
-                | TenantState::Loading
-                | TenantState::Broken { .. }
-                | TenantState::Active => tenant.set_stopping(),
-                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
-            },
-            None => return Err(TenantStateError::NotFound(tenant_id)),
+    let tenant = {
+        TENANTS
+            .write()
+            .await
+            .get(&tenant_id)
+            .cloned()
+            .ok_or(TenantStateError::NotFound(tenant_id))?
+    };
+
+    let freeze_and_flush = false;
+
+    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
+    // that we can continue safely to cleanup.
+    match tenant.shutdown(freeze_and_flush).await {
+        Ok(()) => {}
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

-    // shutdown all tenant and timeline tasks: gc, compaction, page service)
-    // No new tasks will be started for this tenant because it's in `Stopping` state.
-    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
-    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
-
    match tenant_cleanup
        .await
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -595,7 +647,7 @@ where
            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
-                    tenant.set_broken(e.to_string());
+                    tenant.set_broken(e.to_string()).await;
                }
                None => {
                    warn!("Tenant {tenant_id} got removed from memory");
@@ -623,7 +675,7 @@ pub async fn immediate_gc(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
@@ -661,7 +713,6 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-#[cfg(feature = "testing")]
 pub async fn immediate_compact(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -673,11 +724,11 @@ pub async fn immediate_compact(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;

    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;

    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
    Ok(())
 }

-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
-    const PARALLEL_PATH_THRESHOLD: usize = 1;
-    if paths.len() <= PARALLEL_PATH_THRESHOLD {
-        for path in paths {
-            fsync_path(path)?;
-        }
-        return Ok(());
-    }
+fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.

    /// Use at most this number of threads.
    /// Increasing this limit will
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
    let num_threads = paths.len().min(MAX_NUM_THREADS);
    let next_path_idx = AtomicUsize::new(0);

-    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+    std::thread::scope(|s| -> io::Result<()> {
        let mut handles = vec![];
        // Spawn `num_threads - 1`, as the current thread is also a worker.
        for _ in 1..num_threads {
-            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
        }

        parallel_worker(paths, &next_path_idx)?;
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {

        Ok(())
    })
-    .unwrap()
+}
+
+/// Parallel fsync all files. A single path is fsynced inline; otherwise the work is spread over scoped OS threads (`std::thread::scope`), so it can be used in non-async context.
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    if paths.len() == 1 {
+        fsync_path(&paths[0])?;
+        return Ok(());
+    }
+
+    fsync_in_thread_pool(paths)
+}
+
+/// Parallel fsync asynchronously. Every path is fsynced in its own `spawn_blocking` task on the tokio runtime,
+/// with at most `MAX_CONCURRENT_FSYNC` tasks in flight at once. Returns early on the first fsync or join error.
+pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
+    const MAX_CONCURRENT_FSYNC: usize = 64;
+    let mut next = paths.iter().peekable();
+    let mut js = tokio::task::JoinSet::new();
+    loop {
+        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
+            let next = next.next().expect("just peeked");
+            let next = next.to_owned();
+            js.spawn_blocking(move || fsync_path(&next));
+        }
+
+        // now the joinset has been filled up, wait for next to complete
+        if let Some(res) = js.join_next().await {
+            res??;
+        } else {
+            // last item had already completed
+            assert!(
+                next.peek().is_none(),
+                "joinset emptied, we shouldn't have more work"
+            );
+            return Ok(());
+        }
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;

+use std::collections::{HashMap, VecDeque};
+use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

@@ -225,7 +227,9 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
-    Deleted,
+    Deleted(IndexPart),
 }

 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Initialize the queue in stopped state. Used in startup path
+    /// to continue deletion operation interrupted by pageserver crash or restart.
+    pub fn init_upload_queue_stopped_to_continue_deletion(
+        &self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<()> {
+        // FIXME: consider newtype for DeletedIndexPart.
+        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
+            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
+        ))?;
+
+        {
+            let mut upload_queue = self.upload_queue.lock().unwrap();
+            upload_queue.initialize_with_current_remote_index_part(index_part)?;
+            self.update_remote_physical_size_gauge(Some(index_part));
+        }
+        // also locks upload queue, without dropping the guard above it will be a deadlock
+        self.stop().expect("initialized line above");
+
+        let mut upload_queue = self.upload_queue.lock().unwrap();
+
+        upload_queue
+            .stopped_mut()
+            .expect("stopped above")
+            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
+
+        Ok(())
+    }
+
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
+                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
+            }
        }
    }

@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
        .await?;

        if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted)
+            Ok(MaybeDeletedIndexPart::Deleted(index_part))
        } else {
            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
        }
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {

            // schedule the actual deletions
            for name in names {
-                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: false,
+                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
                info!("scheduled layer file deletion {}", name.file_name());
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled uploads/deletions to complete
    ///
    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let (sender, mut receiver) = tokio::sync::watch::channel(());
-        let barrier_op = UploadOp::Barrier(sender);
-
-        {
+        let mut receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            upload_queue.queued_operations.push_back(barrier_op);
-            // Don't count this kind of operation!
-
-            // Launch the task immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        }
+            self.schedule_barrier(upload_queue)
+        };

        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    fn schedule_barrier(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+    ) -> tokio::sync::watch::Receiver<()> {
+        let (sender, receiver) = tokio::sync::watch::channel(());
+        let barrier_op = UploadOp::Barrier(sender);
+
+        upload_queue.queued_operations.push_back(barrier_op);
+        // Don't count this kind of operation!
+
+        // Launch the task immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+
+        receiver
+    }
+
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
+    #[instrument(skip_all)]
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized => {
-                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
-                }
-                UploadQueue::Initialized(_) => {
-                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
-                }
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked.stopped_mut()?;

            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,48 +737,34 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = IndexPart::new(
-                stopped.latest_files.clone(),
-                stopped.last_uploaded_consistent_lsn,
-                stopped
-                    .latest_metadata
-                    .to_bytes()
-                    .context("serialize metadata")?,
-            );
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
+                .context("IndexPart serialize")?;
            index_part.deleted_at = Some(deleted_at);
            index_part
        };

        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });

        // Have a failpoint that can use the `pause` failpoint action.
        // We don't want to block the executor thread, hence, spawn_blocking + await.
-        #[cfg(feature = "testing")]
-        tokio::task::spawn_blocking({
-            let current = tracing::Span::current();
-            move || {
-                let _entered = current.entered();
-                tracing::info!(
-                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
-                );
-                fail::fail_point!(
-                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
-                );
-            }
-        })
-        .await
-        .expect("spawn_blocking");
-
+        if cfg!(feature = "testing") {
+            tokio::task::spawn_blocking({
+                let current = tracing::Span::current();
+                move || {
+                    let _entered = current.entered();
+                    tracing::info!("at failpoint persist_deleted_index_part");
+                    fail::fail_point!("persist_deleted_index_part");
+                }
+            })
+            .await
+            .expect("spawn_blocking");
+        }
        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
@@ -751,13 +778,10 @@ impl RemoteTimelineClient {
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
@@ -768,6 +792,92 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
+    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
+    /// deletes leaked files if any, and proceeds with deletion of the index file at the end.
+    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (mut receiver, deletions_queued) = {
+            let mut deletions_queued = 0;
+
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = locked.stopped_mut()?;
+
+            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
+                anyhow::bail!("deleted_at is not set")
+            }
+
+            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
+
+            stopped
+                .upload_queue_for_deletion
+                .queued_operations
+                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
+
+            // schedule the actual deletions
+            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: true,
+                });
+                self.calls_unfinished_metric_begin(&op);
+                stopped
+                    .upload_queue_for_deletion
+                    .queued_operations
+                    .push_back(op);
+
+                info!("scheduled layer file deletion {}", name.file_name());
+                deletions_queued += 1;
+            }
+
+            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
+
+            (
+                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
+                deletions_queued,
+            )
+        };
+
+        receiver.changed().await?;
+
+        // Do not delete index part yet, it is needed for a possible retry. If we remove it first
+        // and the retry arrives at a different pageserver, there won't be any traces of it on remote storage.
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
+
+        let remaining = self
+            .storage_impl
+            .list_prefixes(Some(&timeline_storage_path))
+            .await?;
+
+        let remaining: Vec<RemotePath> = remaining
+            .into_iter()
+            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
+            .collect();
+
+        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            for file in remaining {
+                warn!("Removing {}", file.object_name().unwrap_or_default());
+                self.storage_impl.delete(&file).await?;
+            }
+        }
+
+        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
+
+        debug!("deleting index part");
+        self.storage_impl.delete(&index_file_path).await?;
+
+        info!(deletions_queued, "done deleting, including index_part.json");
+
+        Ok(())
+    }
+
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -786,7 +896,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -817,7 +927,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -891,7 +1001,6 @@ impl RemoteTimelineClient {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
-                self.calls_unfinished_metric_end(&task.op);
                return;
            }

@@ -937,16 +1046,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
+                UploadOp::Delete(delete) => {
                    let path = &self
                        .conf
                        .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(layer_file_name.file_name());
+                        .join(delete.layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
-                            *metric_file_kind,
+                            delete.file_kind,
                            RemoteOpKind::Delete,
                            Arc::clone(&self.metrics),
                        )
@@ -1012,11 +1121,24 @@ impl RemoteTimelineClient {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_) => {
+                UploadQueue::Stopped(stopped) => {
+                    // Special care is needed for deletions: if it was an earlier deletion (not scheduled from timeline deletion),
+                    // then stop() took care of it, so we just return.
+                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
+                    match &task.op {
+                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
+                        _ => None
+                    }
+                },
+                UploadQueue::Initialized(qi) => { Some(qi) }
+            };
+
+            let upload_queue = match upload_queue {
+                Some(upload_queue) => upload_queue,
+                None => {
                    info!("another concurrent task already stopped the queue");
                    return;
-                }, // nothing to do
-                UploadQueue::Initialized(qi) => { qi }
+                }
            };

            upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1151,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
                }
                UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1185,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(file_kind, _) => (
-                *file_kind,
+            UploadOp::Delete(delete) => (
+                delete.file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1233,36 @@ impl RemoteTimelineClient {
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
-            UploadQueue::Initialized(UploadQueueInitialized {
-                latest_files,
-                latest_metadata,
-                last_uploaded_consistent_lsn,
-                ..
-            }) => {
+            UploadQueue::Initialized(initialized) => {
                info!("shutting down upload queue");

                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    // take or clone what we need
-                    let latest_files = std::mem::take(latest_files);
-                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
-                    // this could be Copy
-                    let latest_metadata = latest_metadata.clone();
-
-                    let stopped = UploadQueueStopped {
-                        latest_files,
-                        last_uploaded_consistent_lsn,
-                        latest_metadata,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    // Here we preserve working version of the upload queue for possible use during deletions.
+                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
+                    // but for this use case it doesn't really make sense to bring in unsafe code only for this usage point.
+                    // Deletion is not really perf sensitive so there shouldn't be any problems with cloning a fraction of it.
+                    let upload_queue_for_deletion = UploadQueueInitialized {
+                        task_counter: 0,
+                        latest_files: initialized.latest_files.clone(),
+                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        num_inprogress_layer_uploads: 0,
+                        num_inprogress_metadata_uploads: 0,
+                        num_inprogress_deletions: 0,
+                        inprogress_tasks: HashMap::default(),
+                        queued_operations: VecDeque::default(),
                    };

-                    let upload_queue =
-                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
+                    let upload_queue = std::mem::replace(
+                        &mut *guard,
+                        UploadQueue::Stopped(UploadQueueStopped {
+                            upload_queue_for_deletion,
+                            deleted_at: SetDeletedFlagProgress::NotRunning,
+                        }),
+                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1144,8 +1270,6 @@ impl RemoteTimelineClient {
                    }
                };

-                assert!(qi.latest_files.is_empty(), "do not use this anymore");
-
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1243,7 +1367,7 @@ mod tests {
    struct TestSetup {
        runtime: &'static tokio::runtime::Runtime,
        entered_runtime: EnterGuard<'static>,
-        harness: TenantHarness<'static>,
+        harness: TenantHarness,
        tenant: Arc<Tenant>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
@@ -1264,9 +1388,12 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let timeline =
-                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-            let _ = timeline.initialize(&ctx).unwrap();
+            let _ = runtime.block_on(tenant.create_test_timeline(
+                TIMELINE_ID,
+                Lsn(8),
+                DEFAULT_PG_VERSION,
+                &ctx,
+            ))?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1410,7 +1537,7 @@ mod tests {
        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::upload_queue::UploadQueueInitialized;

 use utils::lsn::Lsn;

@@ -115,6 +117,21 @@ impl IndexPart {
    }
 }

+impl TryFrom<&UploadQueueInitialized> for IndexPart {
+    type Error = SerializeError;
+
+    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+
+        Ok(Self::new(
+            upload_queue.latest_files.clone(),
+            disk_consistent_lsn,
+            metadata_bytes,
+        ))
+    }
+}
+
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -4,14 +4,20 @@ use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

 use anyhow::{bail, Context};
+use tokio::sync::oneshot::error::RecvError;
+use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;

 use crate::context::RequestContext;
+use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::Tenant;
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

+use tracing::*;
+
 use tenant_size_model::{Segment, StorageModel};

 /// Inputs to the actual tenant sizing model
@@ -117,8 +123,10 @@ pub struct TimelineInputs {
 /// tenant size will be zero.
 pub(super) async fn gather_inputs(
    tenant: &Tenant,
+    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
@@ -311,7 +319,15 @@ pub(super) async fn gather_inputs(

    // We left the 'size' field empty in all of the Segments so far.
    // Now find logical sizes for all of the points that might need or benefit from them.
-    fill_logical_sizes(&timelines, &mut segments, logical_size_cache, ctx).await?;
+    fill_logical_sizes(
+        &timelines,
+        &mut segments,
+        limit,
+        logical_size_cache,
+        cause,
+        ctx,
+    )
+    .await?;

    Ok(ModelInputs {
        segments,
@@ -327,7 +343,9 @@ pub(super) async fn gather_inputs(
 async fn fill_logical_sizes(
    timelines: &[Arc<Timeline>],
    segments: &mut [SegmentMeta],
+    limit: &Arc<Semaphore>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
@@ -343,6 +361,11 @@ async fn fill_logical_sizes(

    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
    // our advantage with `?` error handling.
+    let mut joinset = tokio::task::JoinSet::new();
+
+    let cancel = tokio_util::sync::CancellationToken::new();
+    // be sure to cancel all spawned tasks if we are dropped
+    let _dg = cancel.clone().drop_guard();

    // For each point that would benefit from having a logical size available,
    // spawn a Task to fetch it, unless we have it cached already.
@@ -355,18 +378,71 @@ async fn fill_logical_sizes(
        let lsn = Lsn(seg.segment.lsn);

        if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) {
-            let mut cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned();
+            let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned();
            if cached_size.is_none() {
                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
-                cached_size = Some(timeline.get_logical_size(lsn, ctx).await?);
+                let parallel_size_calcs = Arc::clone(limit);
+                let ctx = ctx.attached_child();
+                joinset.spawn(
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
+                );
            }
            e.insert(cached_size);
        }
    }

+    // Perform the size lookups
+    let mut have_any_error = false;
+    while let Some(res) = joinset.join_next().await {
+        // each of these comes with Result<Result<_, RecvError>, JoinError>
+        // because of spawn + spawn_blocking
+        match res {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
+                have_any_error = true;
+            }
+            Ok(Err(recv_result_error)) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("failed to receive logical size query result: {recv_result_error:#}");
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
+                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
+
+                logical_size_cache.insert((timeline.timeline_id, lsn), size);
+                sizes_needed.insert((timeline.timeline_id, lsn), Some(size));
+            }
+        }
+    }
+
    // prune any keys not needed anymore; we record every used key and added key.
    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));

+    if have_any_error {
+        // we cannot complete this round, because we are missing data.
+        // we have however cached all we were able to request calculation on.
+        anyhow::bail!("failed to calculate some logical_sizes");
+    }
+
    // Insert the looked up sizes to the Segments
    for seg in segments.iter_mut() {
        if !seg.size_needed() {
@@ -408,6 +484,33 @@ impl ModelInputs {
    }
 }

+/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
+struct TimelineAtLsnSizeResult(
+    Arc<crate::tenant::Timeline>,
+    utils::lsn::Lsn,
+    Result<u64, CalculateLogicalSizeError>,
+);
+
+#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
+async fn calculate_logical_size(
+    limit: Arc<tokio::sync::Semaphore>,
+    timeline: Arc<crate::tenant::Timeline>,
+    lsn: utils::lsn::Lsn,
+    cause: LogicalSizeCalculationCause,
+    ctx: RequestContext,
+    cancel: CancellationToken,
+) -> Result<TimelineAtLsnSizeResult, RecvError> {
+    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
+        .await
+        .expect("global semaphore should not had been closed");
+
+    let size_res = timeline
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
+        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
+        .await?;
+    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
+}
+
 #[test]
 fn verify_size_for_multiple_branches() {
    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,6 +4,7 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
@@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
+pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use remote_layer::RemoteLayer;

 use super::layer_map::BatchedUpdates;
@@ -387,10 +389,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
 }

 /// Returned by [`Layer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;

 /// Returned by [`Layer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;

 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
@@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer {
-    fn get_tenant_id(&self) -> TenantId;
+    /// Get the layer descriptor.
+    fn layer_desc(&self) -> &PersistentLayerDesc;
+
+    fn get_tenant_id(&self) -> TenantId {
+        self.layer_desc().tenant_id
+    }

    /// Identify the timeline this layer belongs to
-    fn get_timeline_id(&self) -> TimelineId;
+    fn get_timeline_id(&self) -> TimelineId {
+        self.layer_desc().timeline_id
+    }

    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName;
+    fn filename(&self) -> LayerFileName {
+        self.layer_desc().filename()
+    }

    // Path to the layer file in the local filesystem.
    // `None` for `RemoteLayer`.
@@ -443,7 +454,9 @@ pub trait PersistentLayer: Layer {
    ///
    /// Should not change over the lifetime of the layer object because
    /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> u64;
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

@@ -472,6 +485,20 @@ pub struct LayerDescriptor {
    pub short_id: String,
 }

+impl LayerDescriptor {
+    /// `LayerDescriptor` is only used for testing purposes, so it does not matter whether it is image / delta,
+    /// and the tenant / timeline id does not matter.
+    pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
+        PersistentLayerDesc::new_delta(
+            TenantId::from_array([0; 16]),
+            TimelineId::from_array([0; 16]),
+            self.key.clone(),
+            self.lsn.clone(),
+            233,
+        )
+    }
+}
+
 impl Layer for LayerDescriptor {
    fn get_key_range(&self) -> Range<Key> {
        self.key.clone()
@@ -542,7 +569,7 @@ impl From<LayerFileName> for LayerDescriptor {
 ///
 /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
 /// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
+/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
 /// struct for a file on disk, without having a page server running, so that we have no
 /// config. In that case, we use the Path variant to hold the full path to the file on
 /// disk.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;

 use utils::{
@@ -56,8 +56,8 @@ use utils::{
 };

 use super::{
-    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
-    LayerKeyIter, PathOrConf,
+    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    PathOrConf, PersistentLayerDesc,
 };

 ///
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
-            lsn_range: layer.lsn_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn_range: layer.desc.lsn_range.clone(),

            index_start_blk: 0,
            index_root_blk: 0,
@@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1;
 /// reading/deserializing records themselves.
 ///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
-struct BlobRef(u64);
+pub struct BlobRef(pub u64);

 impl BlobRef {
    pub fn will_init(&self) -> bool {
@@ -180,16 +180,11 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path_or_conf: PathOrConf,

-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
-
-    pub file_size: u64,
+    pub desc: PersistentLayerDesc,

    access_stats: LayerAccessStats,

-    inner: RwLock<DeltaLayerInner>,
+    inner: OnceCell<DeltaLayerInner>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -197,30 +192,26 @@ impl std::fmt::Debug for DeltaLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("DeltaLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
-            .field("lsn_range", &self.lsn_range)
-            .field("file_size", &self.file_size)
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
+            .field("lsn_range", &self.desc.lsn_range)
+            .field("file_size", &self.desc.file_size)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct DeltaLayerInner {
-    /// If false, the fields below have not been loaded into memory yet.
-    loaded: bool,
-
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file. (None if not loaded yet)
-    file: Option<FileBlockReader<VirtualFile>>,
+    /// Reader object for reading blocks from the file.
+    file: FileBlockReader<VirtualFile>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
-            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -228,30 +219,16 @@ impl std::fmt::Debug for DeltaLayerInner {
 }

 impl Layer for DeltaLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-    fn is_incremental(&self) -> bool {
-        true
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        if !verbose {
@@ -265,7 +242,7 @@ impl Layer for DeltaLayer {
            inner.index_start_blk, inner.index_root_blk
        );

-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -324,17 +301,17 @@ impl Layer for DeltaLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.lsn_range.start);
+        ensure!(lsn_range.start >= self.desc.lsn_range.start);
        let mut need_image = true;

-        ensure!(self.key_range.contains(&key));
+        ensure!(self.desc.key_range.contains(&key));

        {
            // Open the file and lock the metadata in memory
            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

            // Scan the page versions backwards, starting from `lsn`.
-            let file = inner.file.as_ref().unwrap();
+            let file = &inner.file;
            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
@@ -402,19 +379,31 @@ impl Layer for DeltaLayer {
            Ok(ValueReconstructResult::Complete)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for DeltaLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
-
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -444,10 +433,6 @@ impl PersistentLayer for DeltaLayer {
        Ok(())
    }

-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
@@ -456,7 +441,7 @@ impl PersistentLayer for DeltaLayer {

        HistoricLayerInfo::Delta {
            layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
            lsn_start: lsn_range.start,
            lsn_end: lsn_range.end,
            remote: false,
@@ -511,51 +496,22 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<RwLockReadGuard<DeltaLayerInner>> {
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        loop {
-            // Quick exit if already loaded
-            let inner = self.inner.read().unwrap();
-            if inner.loaded {
-                return Ok(inner);
-            }
-
-            // Need to open the file and load the metadata. Upgrade our lock to
-            // a write lock. (Or rather, release and re-lock in write mode.)
-            drop(inner);
-            let inner = self.inner.write().unwrap();
-            if !inner.loaded {
-                self.load_inner(inner).with_context(|| {
-                    format!("Failed to load delta layer {}", self.path().display())
-                })?;
-            } else {
-                // Another thread loaded it while we were not holding the lock.
-            }
-
-            // We now have the file open and loaded. There's no function to do
-            // that in the std library RwLock, so we have to release and re-lock
-            // in read mode. (To be precise, the lock guard was moved in the
-            // above call to `load_inner`, so it's already been released). And
-            // while we do that, another thread could unload again, so we have
-            // to re-check and retry if that happens.
-        }
+        // Return the already-loaded inner if present; otherwise load it now.
+        self.inner
+            .get_or_try_init(|| self.load_inner())
+            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();

-        // Open the file if it's not open already.
-        if inner.file.is_none() {
-            let file = VirtualFile::open(&path)
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            inner.file = Some(FileBlockReader::new(file));
-        }
-        let file = inner.file.as_mut().unwrap();
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -582,13 +538,13 @@ impl DeltaLayer {
            }
        }

-        inner.index_start_blk = actual_summary.index_start_blk;
-        inner.index_root_blk = actual_summary.index_root_blk;
-
        debug!("loaded from {}", &path.display());

-        inner.loaded = true;
-        Ok(())
+        Ok(DeltaLayerInner {
+            file,
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+        })
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -602,24 +558,21 @@ impl DeltaLayer {
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
-            lsn_range: filename.lsn_range.clone(),
-            file_size,
+            desc: PersistentLayerDesc::new_delta(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn_range.clone(),
+                file_size,
+            ),
            access_stats,
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        }
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
@@ -632,34 +585,27 @@ impl DeltaLayer {

        Ok(DeltaLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
-            lsn_range: summary.lsn_range,
-            file_size: metadata.len(),
+            desc: PersistentLayerDesc::new_delta(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn_range,
+                metadata.len(),
+            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        })
    }

    fn layer_name(&self) -> DeltaFileName {
-        DeltaFileName {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-        }
+        self.desc.delta_file_name()
    }
-
    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -803,18 +749,15 @@ impl DeltaLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.
        let layer = DeltaLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            tenant_id: self.tenant_id,
-            timeline_id: self.timeline_id,
-            key_range: self.key_start..key_end,
-            lsn_range: self.lsn_range.clone(),
-            file_size: metadata.len(),
+            desc: PersistentLayerDesc::new_delta(
+                self.tenant_id,
+                self.timeline_id,
+                self.key_start..key_end,
+                self.lsn_range.clone(),
+                metadata.len(),
+            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk,
-                index_root_blk,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        };

        // fsync the file
@@ -949,13 +892,13 @@ struct DeltaValueIter<'a> {
    reader: BlockCursor<Adapter<'a>>,
 }

-struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
+struct Adapter<'a>(&'a DeltaLayerInner);

 impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;

    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.as_ref().unwrap().read_blk(blknum)
+        self.0.file.read_blk(blknum)
    }
 }

@@ -968,8 +911,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
 }

 impl<'a> DeltaValueIter<'a> {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1042,8 +985,8 @@ impl Iterator for DeltaKeyIter {
 }

 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1083,3 +1026,21 @@ impl<'a> DeltaKeyIter {
        Ok(iter)
    }
 }
+
+#[cfg(test)]
+mod test {
+    use super::DeltaKeyIter;
+    use super::DeltaLayer;
+    use super::DeltaValueIter;
+
+    // We will soon need the iters to be `Send` in the compaction code.
+    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
+    // Cf https://github.com/neondatabase/neon/issues/4471
+    #[test]
+    fn is_send() {
+        fn assert_send<T: Send>() {}
+        assert_send::<DeltaLayer>();
+        assert_send::<DeltaValueIter>();
+        assert_send::<DeltaKeyIter>();
+    }
+}
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -9,6 +9,8 @@ use std::str::FromStr;

 use utils::lsn::Lsn;

+use super::PersistentLayerDesc;
+
 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
 pub struct DeltaFileName {
@@ -153,7 +155,7 @@ impl Ord for ImageFileName {
 impl ImageFileName {
    pub fn lsn_as_range(&self) -> Range<Lsn> {
        // Saves from having to copypaste this all over
-        self.lsn..(self.lsn + 1)
+        PersistentLayerDesc::image_layer_lsn_range(self.lsn)
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -52,8 +52,8 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{ImageFileName, LayerFileName};
-use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
+use super::filename::ImageFileName;
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
            lsn: layer.lsn,

            index_start_blk: 0,
@@ -104,12 +104,9 @@ impl From<&ImageLayer> for Summary {
 /// and it needs to be loaded before using it in queries.
 pub struct ImageLayer {
    path_or_conf: PathOrConf,
-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub file_size: u64,

-    // This entry contains an image of all pages as of this LSN
+    pub desc: PersistentLayerDesc,
+    // This entry contains an image of all pages as of this LSN, should be the same as desc.image_layer_lsn()
    pub lsn: Lsn,

    access_stats: LayerAccessStats,
@@ -122,8 +119,8 @@ impl std::fmt::Debug for ImageLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("ImageLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
-            .field("file_size", &self.file_size)
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
+            .field("file_size", &self.desc.file_size)
            .field("lsn", &self.lsn)
            .field("inner", &self.inner)
            .finish()
@@ -153,27 +150,15 @@ impl std::fmt::Debug for ImageLayerInner {
 }

 impl Layer for ImageLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        // End-bound is exclusive
-        self.lsn..(self.lsn + 1)
-    }
-    fn is_incremental(&self) -> bool {
-        false
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} ----",
-            self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.lsn
        );

        if !verbose {
@@ -203,7 +188,7 @@ impl Layer for ImageLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.key_range.contains(&key));
+        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

@@ -230,24 +215,37 @@ impl Layer for ImageLayer {
            Ok(ValueReconstructResult::Missing)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for ImageLayer {
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }

-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
        unimplemented!();
    }
@@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer {
        Ok(())
    }

-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();

        HistoricLayerInfo::Image {
            layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
            lsn_start: lsn_range.start,
            remote: false,
            access_stats: self.access_stats.as_api_model(reset),
@@ -405,11 +399,15 @@ impl ImageLayer {
    ) -> ImageLayer {
        ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
+            desc: PersistentLayerDesc::new_img(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn,
+                false,
+                file_size,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
-            file_size,
            access_stats,
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
@@ -422,7 +420,7 @@ impl ImageLayer {

    /// Create an ImageLayer struct representing an existing file on disk.
    ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
@@ -433,11 +431,15 @@ impl ImageLayer {
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
+            desc: PersistentLayerDesc::new_img(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn,
+                false,
+                metadata.len(),
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
-            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                file: None,
@@ -449,18 +451,15 @@ impl ImageLayer {
    }

    fn layer_name(&self) -> ImageFileName {
-        ImageFileName {
-            key_range: self.key_range.clone(),
-            lsn: self.lsn,
-        }
+        self.desc.image_file_name()
    }

    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -484,6 +483,7 @@ struct ImageLayerWriterInner {
    tenant_id: TenantId,
    key_range: Range<Key>,
    lsn: Lsn,
+    is_incremental: bool,

    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -499,6 +499,7 @@ impl ImageLayerWriterInner {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
@@ -533,6 +534,7 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
+            is_incremental,
        };

        Ok(writer)
@@ -588,16 +590,22 @@ impl ImageLayerWriterInner {
            .metadata()
            .context("get metadata to determine file size")?;

+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            self.is_incremental, // for now, image layer ALWAYS covers the full range
+            metadata.len(),
+        );
+
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
        let layer = ImageLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            timeline_id: self.timeline_id,
-            tenant_id: self.tenant_id,
-            key_range: self.key_range.clone(),
+            desc,
            lsn: self.lsn,
-            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
@@ -667,6 +675,7 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -675,6 +684,7 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
+                is_incremental,
            )?),
        })
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -304,7 +304,7 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys

        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -0,0 +1,191 @@
+use anyhow::Result;
+use std::ops::Range;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::{context::RequestContext, repository::Key};
+
+use super::{DeltaFileName, ImageFileName, LayerFileName};
+
+use serde::{Deserialize, Serialize};
+
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// a unified way to generate layer information like file name.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+pub struct PersistentLayerDesc {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    /// For image layer, this is `[lsn, lsn+1)`.
+    pub lsn_range: Range<Lsn>,
+    /// Whether this is a delta layer.
+    pub is_delta: bool,
+    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
+    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
+    /// incremental.
+    pub is_incremental: bool,
+    /// File size
+    pub file_size: u64,
+}
+
+/// A unique identifier of a persistent layer within the context of one timeline.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct PersistentLayerKey {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+    pub is_delta: bool,
+}
+
+impl PersistentLayerDesc {
+    pub fn key(&self) -> PersistentLayerKey {
+        PersistentLayerKey {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+            is_delta: self.is_delta,
+        }
+    }
+
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+
+    #[cfg(test)]
+    pub fn new_test(key_range: Range<Key>) -> Self {
+        Self {
+            tenant_id: TenantId::generate(),
+            timeline_id: TimelineId::generate(),
+            key_range,
+            lsn_range: Lsn(0)..Lsn(1),
+            is_delta: false,
+            is_incremental: false,
+            file_size: 0,
+        }
+    }
+
+    pub fn new_img(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+        is_incremental: bool,
+        file_size: u64,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range: Self::image_layer_lsn_range(lsn),
+            is_delta: false,
+            is_incremental,
+            file_size,
+        }
+    }
+
+    pub fn new_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        file_size: u64,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range,
+            is_delta: true,
+            is_incremental: true,
+            file_size,
+        }
+    }
+
+    /// Get the LSN that the image layer covers.
+    pub fn image_layer_lsn(&self) -> Lsn {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        self.lsn_range.start
+    }
+
+    /// Get the LSN range corresponding to a single image layer LSN.
+    pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
+        lsn..(lsn + 1)
+    }
+
+    /// Get a delta file name for this layer.
+    ///
+    /// Panic: if this is not a delta layer.
+    pub fn delta_file_name(&self) -> DeltaFileName {
+        assert!(self.is_delta);
+        DeltaFileName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        }
+    }
+
+    /// Get an image file name for this layer.
+    ///
+    /// Panic: if this is not an image layer, or the lsn range is not exactly `[lsn, lsn+1)`
+    pub fn image_file_name(&self) -> ImageFileName {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        ImageFileName {
+            key_range: self.key_range.clone(),
+            lsn: self.lsn_range.start,
+        }
+    }
+
+    pub fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            self.delta_file_name().into()
+        } else {
+            self.image_file_name().into()
+        }
+    }
+
+    // TODO: remove this in the future once we refactor timeline APIs.
+
+    pub fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+
+    pub fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    pub fn get_timeline_id(&self) -> TimelineId {
+        self.timeline_id
+    }
+
+    pub fn get_tenant_id(&self) -> TenantId {
+        self.tenant_id
+    }
+
+    pub fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    pub fn is_delta(&self) -> bool {
+        self.is_delta
+    }
+
+    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+        println!(
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
+        );
+
+        Ok(())
+    }
+
+    pub fn file_size(&self) -> u64 {
+        self.file_size
+    }
+}
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -18,11 +18,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
-use super::image_layer::ImageLayer;
+use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
-    LayerResidenceStatus, PersistentLayer,
+    DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -34,19 +33,10 @@ use super::{
 ///
 /// See: [`crate::context::RequestContext`] for authorization to download
 pub struct RemoteLayer {
-    tenantid: TenantId,
-    timelineid: TimelineId,
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-
-    pub file_name: LayerFileName,
+    pub desc: PersistentLayerDesc,

    pub layer_metadata: LayerFileMetadata,

-    is_delta: bool,
-
-    is_incremental: bool,
-
    access_stats: LayerAccessStats,

    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -66,22 +56,14 @@ pub struct RemoteLayer {
 impl std::fmt::Debug for RemoteLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.file_name)
+            .field("file_name", &self.desc.filename())
            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.is_incremental)
+            .field("is_incremental", &self.desc.is_incremental)
            .finish()
    }
 }

 impl Layer for RemoteLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-
    fn get_value_reconstruct_data(
        &self,
        _key: Key,
@@ -95,53 +77,45 @@ impl Layer for RemoteLayer {
        );
    }

-    fn is_incremental(&self) -> bool {
-        self.is_incremental
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenantid,
-            self.timelineid,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        Ok(())
    }

+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
    fn short_id(&self) -> String {
-        self.filename().file_name()
+        self.layer_desc().short_id()
    }
 }

 impl PersistentLayer for RemoteLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenantid
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timelineid
-    }
-
-    fn filename(&self) -> LayerFileName {
-        if self.is_delta {
-            DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            }
-            .into()
-        } else {
-            ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            }
-            .into()
-        }
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -168,15 +142,11 @@ impl PersistentLayer for RemoteLayer {
        true
    }

-    fn file_size(&self) -> u64 {
-        self.layer_metadata.file_size()
-    }
-
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();

-        if self.is_delta {
+        if self.desc.is_delta {
            HistoricLayerInfo::Delta {
                layer_file_name,
                layer_file_size: self.layer_metadata.file_size(),
@@ -210,13 +180,14 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_as_range(),
-            is_delta: false,
-            is_incremental: false,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_img(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn,
+                false,
+                layer_metadata.file_size(),
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -232,13 +203,13 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_range.clone(),
-            is_delta: true,
-            is_incremental: true,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_delta(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn_range.clone(),
+                layer_metadata.file_size(),
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -256,15 +227,12 @@ impl RemoteLayer {
    where
        L: ?Sized + Layer,
    {
-        if self.is_delta {
-            let fname = DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            };
+        if self.desc.is_delta {
+            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
@@ -273,14 +241,11 @@ impl RemoteLayer {
                ),
            ))
        } else {
-            let fname = ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            };
+            let fname = self.desc.image_file_name();
            Arc::new(ImageLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,13 +9,17 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::TenantId;
+use utils::completion;

-pub fn start_background_loops(tenant_id: TenantId) {
+/// Start per tenant background loops: compaction and gc.
+pub fn start_background_loops(
+    tenant: &Arc<Tenant>,
+    background_jobs_can_start: Option<&completion::Barrier>,
+) {
+    let tenant_id = tenant.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
@@ -23,11 +27,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("compactor for tenant {tenant_id}"),
        false,
-        async move {
-            compaction_loop(tenant_id)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let background_jobs_can_start = background_jobs_can_start.cloned();
+            async move {
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                compaction_loop(tenant, cancel)
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
    task_mgr::spawn(
@@ -37,11 +50,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("garbage collector for tenant {tenant_id}"),
        false,
-        async move {
-            gc_loop(tenant_id)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let background_jobs_can_start = background_jobs_can_start.cloned();
+            async move {
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                gc_loop(tenant, cancel)
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
 }
@@ -49,27 +71,26 @@ pub fn start_background_loops(tenant_id: TenantId) {
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant_id: TenantId) {
+async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
-        let cancel = task_mgr::shutdown_token();
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_compaction_period();

@@ -119,29 +140,29 @@ async fn compaction_loop(tenant_id: TenantId) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant_id: TenantId) {
+async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
-        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
-        let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let ctx =
+            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_gc_period();

@@ -161,7 +182,9 @@ async fn gc_loop(tenant_id: TenantId) {
                Duration::from_secs(10)
            } else {
                // Run gc
-                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                let res = tenant
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .await;
                if let Err(e) = res {
                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
@@ -187,23 +210,10 @@ async fn gc_loop(tenant_id: TenantId) {
    trace!("GC loop stopped.");
 }

-async fn wait_for_active_tenant(
-    tenant_id: TenantId,
-    wait: Duration,
-) -> ControlFlow<(), Arc<Tenant>> {
-    let tenant = loop {
-        match mgr::get_tenant(tenant_id, false).await {
-            Ok(tenant) => break tenant,
-            Err(e) => {
-                error!("Failed to get a tenant {tenant_id}: {e:#}");
-                tokio::time::sleep(wait).await;
-            }
-        }
-    };
-
+async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
    // if the tenant has a proper status already, no need to wait for anything
    if tenant.current_state() == TenantState::Active {
-        ControlFlow::Continue(tenant)
+        ControlFlow::Continue(())
    } else {
        let mut tenant_state_updates = tenant.subscribe_for_state_updates();
        loop {
@@ -213,7 +223,7 @@ async fn wait_for_active_tenant(
                    match new_state {
                        TenantState::Active => {
                            debug!("Tenant state changed to active, continuing the task loop");
-                            return ControlFlow::Continue(tenant);
+                            return ControlFlow::Continue(());
                        }
                        state => {
                            debug!("Not running the task loop, tenant is not active: {state:?}");
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,10 +30,12 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        Tenant,
+        LogicalSizeCalculationCause, Tenant,
    },
 };

+use utils::completion;
+
 use super::Timeline;

 #[derive(Default)]
@@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState {
 }

 impl Timeline {
-    pub(super) fn launch_eviction_task(self: &Arc<Self>) {
+    pub(super) fn launch_eviction_task(
+        self: &Arc<Self>,
+        background_tasks_can_start: Option<&completion::Barrier>,
+    ) {
        let self_clone = Arc::clone(self);
+        let background_tasks_can_start = background_tasks_can_start.cloned();
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
@@ -57,7 +63,13 @@ impl Timeline {
            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
            false,
            async move {
-                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
+                };
+
+                self_clone.eviction_task(cancel).await;
                info!("eviction task finishing");
                Ok(())
            },
@@ -185,7 +197,7 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let layers = self.layers.read().unwrap();
+            let layers = self.layers.read().await;
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                if hist_layer.is_remote_layer() {
@@ -294,12 +306,17 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
+                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
        drop(state);

+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+
        // This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
@@ -328,8 +345,36 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
        let lsn = self.get_last_record_lsn();
+
+        // imitate on-restart initial logical size
+        let size = self
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
+            .instrument(info_span!("calculate_logical_size"))
+            .await;
+
+        match &size {
+            Ok(_size) => {
+                // good, don't log it to avoid confusion
+            }
+            Err(_) => {
+                // we have known issues for which we already log this on consumption metrics,
+                // gc, and compaction. leave logging out for now.
+                //
+                // https://github.com/neondatabase/neon/issues/2539
+            }
+        }
+
        // imitiate repartiting on first compactation
        if let Err(e) = self
            .collect_keyspace(lsn, ctx)
@@ -337,7 +382,13 @@ impl Timeline {
            .await
        {
            // if this failed, we probably failed logical size because these use the same keys
-            warn!("failed to collect keyspace but succeeded in calculating logical size: {e:#}");
+            if size.is_err() {
+                // ignore, see above comment
+            } else {
+                warn!(
+                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
+                );
+            }
        }
    }

@@ -374,9 +425,21 @@ impl Timeline {
        // So, the chance of the worst case is quite low in practice.
        // It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
        // So, we must coordinate with other with other eviction tasks of this tenant.
+        let limit = self
+            .conf
+            .eviction_task_immitated_concurrent_logical_size_queries
+            .inner();
+
        let mut throwaway_cache = HashMap::new();
-        let gather = crate::tenant::size::gather_inputs(tenant, None, &mut throwaway_cache, ctx)
-            .instrument(info_span!("gather_inputs"));
+        let gather = crate::tenant::size::gather_inputs(
+            tenant,
+            limit,
+            None,
+            &mut throwaway_cache,
+            LogicalSizeCalculationCause::EvictionTaskImitation,
+            ctx,
+        )
+        .instrument(info_span!("gather_inputs"));

        tokio::select! {
            _ = cancel.cancelled() => {}
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -25,20 +25,19 @@ mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

-use anyhow::Context;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
-use std::sync::atomic::{self, AtomicBool};
-use std::sync::{Arc, Weak};
+use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
 use tokio::select;
-use tokio::sync::{watch, RwLock};
+use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

@@ -62,46 +61,23 @@ pub struct WalReceiverConf {

 pub struct WalReceiver {
    timeline: TenantTimelineId,
-    timeline_ref: Weak<Timeline>,
-    conf: WalReceiverConf,
-    started: AtomicBool,
-    manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
+    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

 impl WalReceiver {
-    pub fn new(
-        timeline: TenantTimelineId,
-        timeline_ref: Weak<Timeline>,
-        conf: WalReceiverConf,
-    ) -> Self {
-        Self {
-            timeline,
-            timeline_ref,
-            conf,
-            started: AtomicBool::new(false),
-            manager_status: Arc::new(RwLock::new(None)),
-        }
-    }
-
    pub fn start(
-        &self,
-        ctx: &RequestContext,
+        timeline: Arc<Timeline>,
+        conf: WalReceiverConf,
        mut broker_client: BrokerClientChannel,
-    ) -> anyhow::Result<()> {
-        if self.started.load(atomic::Ordering::Acquire) {
-            anyhow::bail!("Wal receiver is already started");
-        }
-
-        let timeline = self.timeline_ref.upgrade().with_context(|| {
-            format!("walreceiver start on a dropped timeline {}", self.timeline)
-        })?;
-
+        ctx: &RequestContext,
+    ) -> Self {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-        let wal_receiver_conf = self.conf.clone();
-        let loop_status = Arc::clone(&self.manager_status);
+
+        let loop_status = Arc::new(std::sync::RwLock::new(None));
+        let manager_status = Arc::clone(&loop_status);
        task_mgr::spawn(
            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
@@ -110,15 +86,16 @@ impl WalReceiver {
            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
            false,
            async move {
-                info!("WAL receiver manager started, connecting to broker");
+                debug_assert_current_span_has_tenant_and_timeline_id();
+                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
-                    wal_receiver_conf,
+                    conf,
                );
                loop {
                    select! {
                        _ = task_mgr::shutdown_watcher() => {
-                            info!("WAL receiver shutdown requested, shutting down");
+                            trace!("WAL receiver shutdown requested, shutting down");
                            break;
                        },
                        loop_step_result = connection_manager_loop_step(
@@ -129,7 +106,7 @@ impl WalReceiver {
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
-                                info!("Connection manager loop ended, shutting down");
+                                trace!("Connection manager loop ended, shutting down");
                                break;
                            }
                        },
@@ -137,29 +114,29 @@ impl WalReceiver {
                }

                connection_manager_state.shutdown().await;
-                *loop_status.write().await = None;
+                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
        );

-        self.started.store(true, atomic::Ordering::Release);
-
-        Ok(())
+        Self {
+            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            manager_status,
+        }
    }

-    pub async fn stop(&self) {
+    pub async fn stop(self) {
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
            Some(self.timeline.tenant_id),
            Some(self.timeline.timeline_id),
        )
        .await;
-        self.started.store(false, atomic::Ordering::Release);
    }

-    pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
-        self.manager_status.read().await.clone()
+    pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
+        self.manager_status.read().unwrap().clone()
    }
 }

@@ -223,29 +200,19 @@ impl<E: Clone> TaskHandle<E> {
                TaskEvent::End(match self.join_handle.as_mut() {
                    Some(jh) => {
                        if !jh.is_finished() {
-                            // Barring any implementation errors in this module, we can
-                            // only arrive here while the task that executes the future
-                            // passed to `Self::spawn()` is still execution. Cf the comment
-                            // in Self::spawn().
-                            //
-                            // This was logging at warning level in earlier versions, presumably
-                            // to leave some breadcrumbs in case we had an implementation
-                            // error that would would make us get stuck in `jh.await`.
-                            //
-                            // There hasn't been such a bug so far.
-                            // But in a busy system, e.g., during pageserver restart,
-                            // we arrive here often enough that the warning-level logs
-                            // became a distraction.
-                            // So, tone them down to info-level.
-                            //
-                            // XXX: rewrite this module to eliminate the race condition.
-                            info!("sender is dropped while join handle is still alive");
+                            // See: https://github.com/neondatabase/neon/issues/2885
+                            trace!("sender is dropped while join handle is still alive");
                        }

-                        let res = jh
-                            .await
-                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
-                            .and_then(|x| x);
+                        let res = match jh.await {
+                            Ok(res) => res,
+                            Err(je) if je.is_cancelled() => unreachable!("not used"),
+                            Err(je) if je.is_panic() => {
+                                // already logged
+                                Ok(())
+                            }
+                            Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
+                        };

                        // For cancellation-safety, drop join_handle only after successful .await.
                        self.join_handle = None;
@@ -268,12 +235,12 @@ impl<E: Clone> TaskHandle<E> {
            match jh.await {
                Ok(Ok(())) => debug!("Shutdown success"),
                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                Err(join_error) => {
-                    if join_error.is_cancelled() {
-                        error!("Shutdown task was cancelled");
-                    } else {
-                        error!("Shutdown task join error: {join_error}")
-                    }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => {
+                    // already logged
+                }
+                Err(je) => {
+                    error!("Shutdown task join error: {je}")
                }
            }
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -18,7 +18,7 @@ use crate::metrics::{
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::Timeline;
+use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
@@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
 use tokio::select;
-use tokio::sync::RwLock;
 use tracing::*;

 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
-    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
+    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
    match connection_manager_state
        .timeline
@@ -56,8 +55,11 @@ pub(super) async fn connection_manager_loop_step(
        .await
    {
        Ok(()) => {}
-        Err(_) => {
-            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
+        Err(new_state) => {
+            debug!(
+                ?new_state,
+                "state changed, stopping wal connection manager loop"
+            );
            return ControlFlow::Break(());
        }
    }
@@ -80,7 +82,7 @@ pub(super) async fn connection_manager_loop_step(
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
-    info!("Subscribed for broker timeline updates");
+    debug!("Subscribed for broker timeline updates");

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -151,13 +153,13 @@ pub(super) async fn connection_manager_loop_step(
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping => {
-                                    info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
+                                TimelineState::Broken { .. } | TimelineState::Stopping => {
+                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
                                TimelineState::Loading => {
                                    warn!("timeline transitioned back to Loading state, that should not happen");
-                                    return ControlFlow::Continue(new_state);
+                                    return ControlFlow::Continue(());
                                }
                            }
                        }
@@ -165,12 +167,11 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => match new_event {
-                ControlFlow::Continue(new_state) => {
-                    info!("observed timeline state change, new state is {new_state:?}");
+                ControlFlow::Continue(()) => {
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
-                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
+                    debug!("Timeline is no longer active, stopping wal connection manager loop");
                    return ControlFlow::Break(());
                }
            },
@@ -195,7 +196,7 @@ pub(super) async fn connection_manager_loop_step(
                .change_connection(new_candidate, ctx)
                .await
        }
-        *manager_status.write().await = Some(connection_manager_state.manager_status());
+        *manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
    }
 }

@@ -391,7 +392,6 @@ impl ConnectionManagerState {

        self.drop_old_connection(true).await;

-        let id = self.id;
        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let timeline = Arc::clone(&self.timeline);
@@ -399,9 +399,13 @@ impl ConnectionManagerState {
            TaskKind::WalReceiverConnectionHandler,
            DownloadBehavior::Download,
        );
+
+        let span = info_span!("connection", %node_id);
        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
            async move {
-                super::walreceiver_connection::handle_walreceiver_connection(
+                debug_assert_current_span_has_tenant_and_timeline_id();
+
+                let res = super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
@@ -410,12 +414,23 @@ impl ConnectionManagerState {
                    ctx,
                    node_id,
                )
-                .await
-                .context("walreceiver connection handling failure")
+                .await;
+
+                match res {
+                    Ok(()) => Ok(()),
+                    Err(e) => {
+                        use super::walreceiver_connection::ExpectedError;
+                        if e.is_expected() {
+                            info!("walreceiver connection handling ended: {e:#}");
+                            Ok(())
+                        } else {
+                            // return an error so that task_mgr logs it verbosely
+                            Err(e).context("walreceiver connection handling failure")
+                        }
+                    }
+                }
            }
-            .instrument(
-                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
-            )
+            .instrument(span)
        });

        let now = Utc::now().naive_utc();
@@ -1306,12 +1321,12 @@ mod tests {

    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";

-    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
+    async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
+            .await
            .expect("Failed to create an empty timeline for dummy wal connection manager");
-        let timeline = timeline.initialize(&ctx).unwrap();

        ConnectionManagerState {
            id: TenantTimelineId {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, trace, warn};
+use tracing::{debug, error, info, trace, warn, Instrument};

 use super::TaskStateUpdate;
-use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
+    context::RequestContext,
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
-    tenant::{Timeline, WalReceiverInfo},
+    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
 };
@@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection(
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
-            Ok(Ok(client_and_conn)) => client_and_conn,
-            Ok(Err(conn_err)) => {
-                let expected_error = ignore_expected_errors(conn_err)?;
-                info!("DB connection stream finished: {expected_error}");
-                return Ok(());
-            }
-            Err(_) => {
+            Ok(client_and_conn) => client_and_conn?,
+            Err(_elapsed) => {
                // Timing out to connect to a safekeeper node could happen long time, due to
                // many reasons that pageserver cannot control.
                // Do not produce an error, but make it visible, that timeouts happen by logging the `event.
@@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection(
        }
    };

-    info!("connected!");
+    debug!("connected!");
    let mut connection_status = WalConnectionStatus {
        is_connected: true,
        has_processed_wal: false,
@@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection(
        "walreceiver connection",
        false,
        async move {
+            debug_assert_current_span_has_tenant_and_timeline_id();
+
            select! {
                connection_result = connection => match connection_result {
-                    Ok(()) => info!("Walreceiver db connection closed"),
+                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if let Err(e) = ignore_expected_errors(connection_error) {
-                            warn!("Connection aborted: {e:#}")
+                        if connection_error.is_expected() {
+                            // silence, because most likely we've already exited the outer call
+                            // with a similar error.
+                        } else {
+                            warn!("Connection aborted: {connection_error:#}")
                        }
                    }
                },
-                // Future: replace connection_cancellation with connection_ctx cancellation
-                _ = connection_cancellation.cancelled() => info!("Connection cancelled"),
+                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
            }
            Ok(())
-        },
+        }
+        .instrument(tracing::info_span!("poller")),
    );

    // Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection(
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
-                info!("walreceiver interrupted");
+                debug!("walreceiver interrupted");
                None
            }
            replication_message = physical_stream.next() => replication_message,
        }
    } {
-        let replication_message = match replication_message {
-            Ok(message) => message,
-            Err(replication_error) => {
-                let expected_error = ignore_expected_errors(replication_error)?;
-                info!("Replication stream finished: {expected_error}");
-                return Ok(());
-            }
-        };
+        let replication_message = replication_message?;

        let now = Utc::now().naive_utc();
        let last_rec_lsn_before_msg = last_rec_lsn;
@@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection(
                    let mut decoded = DecodedWALRecord::default();
                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                        // let _enter = info_span!("processing record", lsn = %lsn).entered();
-
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
@@ -313,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

-        timeline.check_checkpoint_distance().with_context(|| {
-            format!(
-                "Failed to check checkpoint distance for timeline {}",
-                timeline.timeline_id
-            )
-        })?;
+        timeline
+            .check_checkpoint_distance()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to check checkpoint distance for timeline {}",
+                    timeline.timeline_id
+                )
+            })?;

        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn =
@@ -346,7 +340,9 @@ pub(super) async fn handle_walreceiver_connection(

            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
-            let timeline_logical_size = timeline.get_current_logical_size();
+            let (timeline_logical_size, _) = timeline
+                .get_current_logical_size(&ctx)
+                .context("Status update creation failed to get current logical size")?;
            let status_update = PageserverFeedback {
                current_timeline_size: timeline_logical_size,
                last_received_lsn,
@@ -419,31 +415,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
    }
 }

-/// We don't want to report connectivity problems as real errors towards connection manager because
-/// 1. they happen frequently enough to make server logs hard to read and
-/// 2. the connection manager can retry other safekeeper.
-///
-/// If this function returns `Ok(pg_error)`, it's such an error.
-/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-/// Connection manager will then handle reconnections.
-///
-/// If this function returns an `Err()`, the caller can bubble it up using `?`.
-/// The connection manager will log the error at ERROR level.
-fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
-    if pg_error.is_closed()
-        || pg_error
-            .source()
-            .and_then(|source| source.downcast_ref::<std::io::Error>())
-            .map(is_expected_io_error)
-            .unwrap_or(false)
-    {
-        return Ok(pg_error);
-    } else if let Some(db_error) = pg_error.as_db_error() {
-        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-            && db_error.message().contains("ending streaming")
-        {
-            return Ok(pg_error);
-        }
-    }
-    Err(pg_error).context("connection error")
+/// Trait to avoid reporting walreceiver-specific expected ("normal" or "ok") errors.
+pub(super) trait ExpectedError {
+    /// Test if this error is an ok error.
+    ///
+    /// We don't want to report connectivity problems as real errors towards connection manager because
+    /// 1. they happen frequently enough to make server logs hard to read and
+    /// 2. the connection manager can retry other safekeeper.
+    ///
+    /// If this function returns `true`, it's such an error.
+    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+    /// Connection manager will then handle reconnections.
+    ///
+    /// If this function returns `false`, the error should be propagated and the connection manager
+    /// will log the error at ERROR level.
+    fn is_expected(&self) -> bool;
+}
+
+impl ExpectedError for postgres::Error {
+    fn is_expected(&self) -> bool {
+        self.is_closed()
+            || self
+                .source()
+                .and_then(|source| source.downcast_ref::<std::io::Error>())
+                .map(is_expected_io_error)
+                .unwrap_or(false)
+            || self
+                .as_db_error()
+                .filter(|db_error| {
+                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+                        && db_error.message().contains("ending streaming")
+                })
+                .is_some()
+    }
+}
+
+impl ExpectedError for anyhow::Error {
+    fn is_expected(&self) -> bool {
+        let head = self.downcast_ref::<postgres::Error>();
+
+        let tail = self
+            .chain()
+            .filter_map(|e| e.downcast_ref::<postgres::Error>());
+
+        // check if self or any of the chained/sourced errors are expected
+        head.into_iter().chain(tail).any(|e| e.is_expected())
+    }
 }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }

+impl UploadQueueInitialized {
+    pub(super) fn no_pending_work(&self) -> bool {
+        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
+    }
+}
+
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
    NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
 }

 pub(super) struct UploadQueueStopped {
-    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-    pub(super) last_uploaded_consistent_lsn: Lsn,
-    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

@@ -187,6 +191,15 @@ impl UploadQueue {
            UploadQueue::Initialized(x) => Ok(x),
        }
    }
+
+    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
+        match self {
+            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Stopped(stopped) => Ok(stopped),
+        }
+    }
 }

 /// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }

+#[derive(Debug)]
+pub(crate) struct Delete {
+    pub(crate) file_kind: RemoteOpFileKind,
+    pub(crate) layer_file_name: LayerFileName,
+    pub(crate) scheduled_from_timeline_delete: bool,
+}
+
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),

-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
+    /// Delete a layer file
+    Delete(Delete),

    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
                )
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Delete(delete) => write!(
+                f,
+                "Delete(path: {}, scheduled_from_timeline_delete: {})",
+                delete.layer_file_name.file_name(),
+                delete.scheduled_from_timeline_delete
+            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

-use anyhow::Result;
+use anyhow::{Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {

        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit()?;
+        modification.commit().await?;

        Ok(())
    }
@@ -1082,7 +1082,10 @@ impl<'a> WalIngest<'a> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            modification.put_rel_creation(rel, 0, ctx).await?;
+            modification
+                .put_rel_creation(rel, 0, ctx)
+                .await
+                .context("Relation Error")?;
            0
        } else {
            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1171,7 +1174,6 @@ impl<'a> WalIngest<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::pgdatadir_mapping::create_test_timeline;
    use crate::tenant::harness::*;
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1200,7 +1202,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit()?;
+        m.commit().await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
@@ -1209,7 +1211,9 @@ mod tests {
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
@@ -1217,22 +1221,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -1318,7 +1322,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -1360,7 +1364,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1373,7 +1377,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1398,7 +1402,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1428,14 +1432,16 @@ mod tests {
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1454,7 +1460,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel is not visible anymore
        assert_eq!(
@@ -1472,7 +1478,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1497,7 +1503,9 @@ mod tests {
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        // Create a 20 MB relation (the size is arbitrary)
@@ -1509,7 +1517,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;

        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1554,7 +1562,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check reported size and contents after truncation
        assert_eq!(
@@ -1603,7 +1611,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;

        assert_eq!(
            tline
@@ -1637,7 +1645,9 @@ mod tests {
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut lsn = 0x10;
@@ -1648,7 +1658,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
        }

        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1664,7 +1674,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1677,7 +1687,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1693,7 +1703,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/Show More
+++ b/Show More