Compare commits

188 Commits

Author SHA1 Message Date
Alexander Bayandin
6eae4fc9aa Release 2023-08-02: update pg_embedding (#4877)
Cherry-picking ca4d71a954 from `main` into
the `release`

Co-authored-by: Vadim Kharitonov <vadim2404@users.noreply.github.com>
2023-08-03 08:48:09 +02:00
Christian Schwarz
765455bca2 Merge pull request #4861 from neondatabase/releases/2023-08-01--2-fix-pipeline
ci: fix upload-postgres-extensions-to-s3 job
2023-08-01 13:22:07 +02:00
Christian Schwarz
4204960942 ci: fix upload-postgres-extensions-to-s3 job
The commit

	commit 5f8fd640bf
	Author: Alek Westover <alek.westover@gmail.com>
	Date:   Wed Jul 26 08:24:03 2023 -0400

	    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but the
`promote-images` job only uploads `latest` to the prod ECR.

The switch to using the release tag was good in principle, but we are
reverting that part to make the release pipeline work.

Note that a proper fix should abandon use of `:latest` tag
at all: currently, if a `main` pipeline runs concurrently
with a `release` pipeline, the `release` pipeline may end
up using the `main` pipeline's images.
2023-08-01 12:01:45 +02:00
Christian Schwarz
67345d66ea Merge pull request #4858 from neondatabase/releases/2023-08-01
Release 2023-08-01
2023-08-01 10:44:01 +02:00
Alex Chi Z
7b6c849456 support isolation level + read only for http batch sql (#4830)
We retrieve `neon-batch-isolation-level` and `neon-batch-read-only`
from the HTTP headers; they set the transaction properties.
https://github.com/neondatabase/serverless/pull/38#issuecomment-1653130981
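
For illustration, a client might set these headers like so (only the two header names come from this commit; the URL, accepted values, and body shape are assumptions):

```
use serde_json::json;

// `endpoint_url` and the body shape are assumptions; only the two
// header names come from the commit message.
async fn run_batch(endpoint_url: &str) -> Result<(), reqwest::Error> {
    let resp = reqwest::Client::new()
        .post(endpoint_url)
        .header("neon-batch-isolation-level", "Serializable") // assumed value format
        .header("neon-batch-read-only", "true")
        .json(&json!({ "queries": [{ "query": "SELECT 1", "params": [] }] }))
        .send()
        .await?;
    println!("{}", resp.text().await?);
    Ok(())
}
```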

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-08-01 02:59:11 +03:00
Joonas Koivunen
326189d950 consumption_metrics: send timeline_written_size_delta (#4822)
We want to have `timeline_written_size_delta`, defined as the
difference between the current `timeline_written_size` and the
previously sent one.

The solution is to send it. On the first round, the `disk_consistent_lsn`
captured at `load` time is used. After that, an incremental "event"
is sent on every collection. Incremental "events" are not part of
deduplication.

I've added some infrastructure to allow somewhat typesafe
`EventType::Absolute` and `EventType::Incremental` factories per
metrics, now that we have our first `EventType::Incremental` usage.
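
The shape of that event type split, sketched (the payload fields are assumptions):

```
use std::time::SystemTime;

// Names follow the commit message; the payloads are assumptions.
enum EventType {
    // "this is the value as of `time`"; deduplicated against the last send
    Absolute { time: SystemTime },
    // "this much was added between `start_time` and `stop_time`";
    // deltas are not deduplicated, unlike absolute events
    Incremental { start_time: SystemTime, stop_time: SystemTime },
}
```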
2023-07-31 22:10:19 +03:00
bojanserafimov
ddbe170454 Prewarm compute nodes (#4828) 2023-07-31 14:13:32 -04:00
Alexander Bayandin
39e458f049 test_compatibility: fix pg_tenant_only_port port collision (#4850)
## Problem

Compatibility tests fail from time to time due to `pg_tenant_only_port`
port collision (added in https://github.com/neondatabase/neon/pull/4731)

## Summary of changes
- replace `pg_tenant_only_port` value in config with new port
- remove old logic that we don't need anymore
- unify config overrides
2023-07-31 20:49:46 +03:00
Vadim Kharitonov
e1424647a0 Update pg_embedding to 0.3.1 version (#4811) 2023-07-31 20:23:18 +03:00
Yinnan Yao
705ae2dce9 Fix error message for listen_pg_addr_tenant_only binding (#4787)
## Problem

Wrong use of `conf.listen_pg_addr` in `error!()`.

## Summary of changes

Use `listen_pg_addr_tenant_only` instead of `conf.listen_pg_addr`.

Signed-off-by: yaoyinnan <35447132+yaoyinnan@users.noreply.github.com>
2023-07-31 14:40:52 +01:00
Conrad Ludgate
eb78603121 proxy: div by zero (#4845)
## Problem

1. In the CacheInvalid state loop, we weren't checking the
`num_retries`. If this managed to get up to `32`, the retry_after
procedure would compute 2^32 which would overflow to 0 and trigger a div
by zero
2. When fixing the above, I started working on a flow diagram for the
state machine logic and realised it was more complex than it had to be:
  a. We start in a `Cached` state
b. `Cached`: call `connect_once`. After the first connect_once error, we
always move to the `CacheInvalid` state, otherwise, we return the
connection.
c. `CacheInvalid`: we attempt to `wake_compute` and we either switch to
Cached or we retry this step (or we error).
d. `Cached`: call `connect_once`. We either retry this step or we have a
connection (or we error) - After num_retries > 1 we never switch back to
`CacheInvalid`.

## Summary of changes

1. Insert a `num_retries` check in the `handle_try_wake` procedure. Also
using floats in the retry_after procedure to prevent the overflow
entirely
2. Refactor connect_to_compute to be more linear in design.
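
A minimal sketch of the float-based fix from point 1 (the constants are assumptions, not the proxy's real values):

```
use std::time::Duration;

const BASE_RETRY_DELAY_SECS: f64 = 0.1; // assumed base delay
const MAX_RETRY_DELAY_SECS: f64 = 10.0; // assumed cap

// With floats, 2^32 is representable: the delay saturates at the cap
// instead of wrapping to zero as fixed-width integer arithmetic can.
fn retry_after(num_retries: u32) -> Duration {
    let delay = BASE_RETRY_DELAY_SECS * 2f64.powi(num_retries as i32);
    Duration::from_secs_f64(delay.min(MAX_RETRY_DELAY_SECS))
}
```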
2023-07-31 09:30:24 -04:00
John Spray
f0ad603693 pageserver: add unit test for deleted_at in IndexPart (#4844)
## Problem

Existing IndexPart unit tests only exercised the version 1 format (i.e.
without deleted_at set).

## Summary of changes

Add a test that sets version to 2, and sets a value for deleted_at.

Closes https://github.com/neondatabase/neon/issues/4162
2023-07-31 12:51:18 +01:00
Arpad Müller
e5183f85dc Make DiskBtreeReader::dump async (#4838)
## Problem

`DiskBtreeReader::dump` calls `read_blk` internally, which we want to
make async in the future. As it currently relies on recursion, and
async doesn't mix well with recursion, we want an alternative:
traverse the tree with a loop and a manual stack instead.

## Summary of changes

* Make `DiskBtreeReader::dump` and all the places calling it async
* Make `DiskBtreeReader::dump` non-recursive internally and use a stack
instead. It now deparses the node in each iteration, which isn't
optimal, but storing the node is hard because it references the
buffer; self-referential data is hard in Rust. For a dumping function,
speed isn't a priority, so we now deparse the node multiple times
(up to branching-factor many times).

Part of https://github.com/neondatabase/neon/issues/4743

I have verified that output is unchanged by comparing the output of this
command both before and after this patch:
```
cargo test -p pageserver -- particular_data --nocapture 
```
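
A self-contained sketch of the recursion-to-stack rewrite described above; `Reader` and `Node` are stand-ins for the real block reader and deparsed B-tree node:

```
// Illustrative only: `read_node` stands in for read_blk + deparsing.
struct Node { children: Vec<u32> }

struct Reader;
impl Reader {
    async fn read_node(&self, _blknum: u32) -> anyhow::Result<Node> {
        Ok(Node { children: Vec::new() }) // stub
    }
}

async fn dump(reader: &Reader, root: u32) -> anyhow::Result<()> {
    let mut stack = vec![(root, 0usize)]; // (block number, depth)
    while let Some((blknum, depth)) = stack.pop() {
        // The node is re-parsed on every visit, as the commit notes.
        let node = reader.read_node(blknum).await?;
        println!("{:indent$}block {}", "", blknum, indent = depth * 2);
        // Push children in reverse so they pop in original order.
        for child in node.children.iter().rev() {
            stack.push((*child, depth + 1));
        }
    }
    Ok(())
}
```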
2023-07-31 12:52:29 +02:00
Joonas Koivunen
89ee8f2028 fix: demote warnings, fix flakiness (#4837)
`WARN ... found future (image|delta) layer` are not actionable log
lines. They don't need to be warnings. `info!` is enough.

This also fixes some known but untracked flakiness in
[`test_remote_timeline_client_calls_started_metric`][evidence].

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b

Closes #3369.
Closes #4473.
2023-07-31 07:43:12 +00:00
Alex Chi Z
a8f3540f3d proxy: add unit test for wake_compute (#4819)
## Problem

ref https://github.com/neondatabase/neon/pull/4721, ref
https://github.com/neondatabase/neon/issues/4709

## Summary of changes

This PR adds unit tests for wake_compute.

The patch adds a new variant `Test` to auth backends. When
`wake_compute` is called, we will verify if it is the exact operation
sequence we are expecting. The operation sequence now contains 3 more
operations: `Wake`, `WakeRetry`, and `WakeFail`.

The unit tests for proxy connects are now complete and I'll continue
work on WebSocket e2e test in future PRs.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-28 19:10:55 -04:00
Konstantin Knizhnik
4338eed8c4 Make it possible to grant self permissions to self-created roles (#4821)
## Problem

See: https://neondb.slack.com/archives/C04USJQNLD6/p1689973957908869

## Summary of changes

Bump Postgres version 

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-07-28 22:06:03 +03:00
Joonas Koivunen
2fbdf26094 test: raise timeout to avoid flakiness (#4832)
2s timeout was too tight for our CI,
[evidence](https://neon-github-public-dev.s3.amazonaws.com/reports/main/5669956577/index.html#/testresult/6388e31182cc2d6e).
15s might be better.

Also cleanup code no longer needed after #4204.
2023-07-28 14:32:01 -04:00
Alexander Bayandin
7374634845 test_runner: clean up test_compatibility (#4770)
## Problem

We have some outdated logic in test_compatibility that we don't need
anymore.

## Summary of changes
- Remove `PR4425_ALLOWED_DIFF` and tune `dump_differs` method to accept
allowed diffs in the future (a cleanup after
https://github.com/neondatabase/neon/pull/4425)
- Remove etcd-related code (a cleanup after
https://github.com/neondatabase/neon/pull/2733)
- Don't set `preserve_database_files`
2023-07-28 16:15:31 +01:00
Alexander Bayandin
9fdd3a4a1e test_runner: add amcheck to test_compatibility (#4772)
Run `pg_amcheck` in forward and backward compatibility tests to catch
some data corruption.

## Summary of changes
- Add amcheck compiling to Makefile
- Add `pg_amcheck` to test_compatibility
2023-07-28 16:00:55 +01:00
Alek Westover
3681fc39fd modify relative_path_to_s3_object logic for prefix=None (#4795)
see added unit tests for more description
2023-07-28 10:03:18 -04:00
Joonas Koivunen
67d2fa6dec test: fix test_neon_cli_basics flakiness without making it better for the future (#4827)
The test was starting two endpoints on the same branch as discovered by
@petuhovskiy.

The fix is to allow passing branch-name from the python side over to
neon_local, which already accepted it.

Split from #4824, which will handle making this more misuse resistant.
2023-07-27 19:13:58 +03:00
Dmitry Rodionov
cafbe8237e Move tenant/delete.rs to tenant/timeline/delete.rs (#4825)
Move tenant/delete.rs to tenant/timeline/delete.rs to prepare for the
appearance of tenant deletion routines in tenant/delete.rs.
2023-07-27 15:52:36 +03:00
Joonas Koivunen
3e425c40c0 fix(compute_ctl): remove stray variable in error message (#4823)
The error is not needed because anyhow will report the cause chain
anyway.

Related to test_neon_cli_basics being flaky, but this doesn't actually
fix any flakiness, just the obvious stray `{e}`.
2023-07-27 15:40:53 +03:00
Joonas Koivunen
395bd9174e test: allow future image layer warning (#4818)
https://neon-github-public-dev.s3.amazonaws.com/reports/main/5670795960/index.html#suites/837740b64a53e769572c4ed7b7a7eeeb/5a73fa4a69399123/retries

Allow it because we are doing an immediate stop.
2023-07-27 10:22:44 +03:00
Alek Westover
b9a7a661d0 add list of public extensions and lookup table for libraries (#4807) 2023-07-26 15:55:55 -04:00
Joonas Koivunen
48ce95533c test: allow normal warnings in test_threshold_based_eviction (#4801)
See:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/5654328815/index.html#suites/3fc871d9ee8127d8501d607e03205abb/3482458eba88c021
2023-07-26 20:20:12 +03:00
Dmitry Rodionov
874c31976e dedup cleanup fs traces (#4778)
This is a follow-up to the discussion at
https://github.com/neondatabase/neon/pull/4552#discussion_r1253417777;
see the context there.
2023-07-26 18:39:32 +03:00
Conrad Ludgate
231d7a7616 proxy: retry compute wake in auth (#4817)
## Problem

wake_compute can fail sometimes but is eligible for retries. We retry
during the main connect, but not during auth.

## Summary of changes

retry wake_compute during auth flow if there was an error talking to
control plane, or if there was a temporary error in waking the compute
node
2023-07-26 16:34:46 +01:00
arpad-m
5705413d90 Use OnceLock instead of manually implementing it (#4805)
## Problem

In https://github.com/neondatabase/neon/issues/4743 , I'm trying to make
more of the pageserver async, but in order for that to happen, I need to
be able to persist the result of `ImageLayer::load` across await points.
For that to happen, the return value needs to be `Send`.

## Summary of changes

Use `OnceLock` in the image layer instead of manually implementing it
with booleans, locks and `Option`.

Part of #4743
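
An illustrative reduction of the pattern (the types here are not the real pageserver ones):

```
use std::sync::OnceLock;

// One lock-free cell replaces a `bool` + lock + `Option` combination,
// and the result is `Send`/`Sync` whenever `Summary` is.
struct Summary { num_pages: u32 } // stand-in for the parsed layer header

struct ImageLayer {
    summary: OnceLock<Summary>,
}

impl ImageLayer {
    fn load(&self) -> &Summary {
        // The first caller initializes; everyone else gets the cached value.
        self.summary.get_or_init(|| Summary { num_pages: 0 /* read file here */ })
    }
}
```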
2023-07-26 17:20:09 +02:00
Conrad Ludgate
35370f967f proxy: add some connection init logs (#4812)
## Problem

The first session event we emit is after we receive the first startup
packet from the client. This means we can't detect any issues between
TCP open and handling of the first PG packet

## Summary of changes

Add some new logs for websocket upgrade and connection handling
2023-07-26 15:03:51 +00:00
Alexander Bayandin
b98419ee56 Fix allure report overwriting for different Postgres versions (#4806)
## Problem

We've got an example of Allure reports from 2 different runners for the
same build that started to upload at the exact second, making one
overwrite another

## Summary of changes
- Use the Postgres version to distinguish artifacts (along with the
build type)
2023-07-26 15:19:18 +01:00
Alexander Bayandin
86a61b318b Bump certifi from 2022.12.7 to 2023.7.22 (#4815) 2023-07-26 16:32:56 +03:00
Alek Westover
5f8fd640bf Upload Test Remote Extensions (#4792)
We need some real extensions in S3 to accurately test the code for
handling remote extensions.
In this PR we just upload three extensions (anon, kq_imcx and postgis), which is
enough for testing purposes for now. In addition to creating and
uploading the extension archives, we must generate a file
`ext_index.json` which specifies important metadata about the
extensions.

---------

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-07-26 15:24:03 +03:00
bojanserafimov
916a5871a6 compute_ctl: Parse sk connstring (#4809) 2023-07-26 08:10:49 -04:00
Dmitry Rodionov
700d929529 Init Timeline in Stopping state in create_timeline_struct when Cause::Delete (#4780)
See
https://github.com/neondatabase/neon/pull/4552#discussion_r1258368127
for context.

TLDR: use CreateTimelineCause to infer desired state instead of using
.set_stopping after initialization
2023-07-26 14:05:18 +03:00
bojanserafimov
520046f5bd cold starts: Add sync-safekeepers fast path (#4804) 2023-07-25 19:44:18 -04:00
Conrad Ludgate
2ebd2ce2b6 proxy: record connection type (#4802)
## Problem

We want to measure how many users are using TCP/WS connections.
We also want to measure how long it takes to establish a connection with
the compute node.

I plan to also add a separate counter for HTTP requests, but because of
pooling this needs to be disambiguated against new HTTP compute
connections

## Summary of changes

* record connection type (ws/tcp) in the connection counters.
* record connection latency including retry latency
2023-07-25 18:57:42 +03:00
Alex Chi Z
bcc2aee704 proxy: add tests for batch http sql (#4793)
This PR adds an integration test case for batch HTTP SQL endpoint.
https://github.com/neondatabase/neon/pull/4654/ should be merged first
before we land this PR.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-25 15:08:24 +00:00
Dmitry Rodionov
6d023484ed Use mark file to allow for deletion operations to continue through restarts (#4552)
## Problem

Currently we delete local files first, so if pageserver restarts after
local files deletion then remote deletion is not continued. This can be
solved with inversion of these steps.

But even if these steps are inverted, when index_part.json is deleted
there is no way to distinguish between "this timeline is good, we just
didn't upload it to remote" and "this timeline is deleted, we should
continue with removal of local state". To solve this we use another
mark file: after the index part is deleted, the presence of this mark
file identifies a deletion intention.

An alternative approach that was discussed was to delete everything
except metadata first, and then delete metadata and index part. In this
case we still do not support local-only configs, making them rather
unsafe (deletion in them is already unsafe, and this approach solidifies
that direction instead of fixing it). Another downside is that if we
crash after local metadata gets removed, we may leave a dangling index
part on the remote, which in theory shouldn't be a big deal because the
file is small.

It is not a big change to choose another approach at this point.

## Summary of changes

Timeline deletion sequence:
1. Set deleted_at in remote index part.
2. Create local mark file.
3. Delete local files except metadata (it is simpler this way, to be
able to reuse timeline initialization code that expects metadata)
4. Delete remote layers
5. Delete index part
6. Delete meta, timeline directory.
7. Delete mark file.

This works for local only configuration without remote storage.
Sequence is resumable from any point.
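
A compact sketch of the resumability argument, assuming every step can detect "already done"; all names are illustrative, not the pageserver's real ones:

```
struct Timeline;

impl Timeline {
    fn deletion_mark_exists(&self) -> bool {
        false // stub: checks for the mark file from step 2
    }

    async fn delete(&self) -> anyhow::Result<()> {
        // Steps 1-7 from the list above would run here in order.
        // Each step is idempotent, so crashing between any two steps
        // and re-running the whole sequence only repeats some work.
        Ok(())
    }

    async fn resume_deletion_on_startup(&self) -> anyhow::Result<()> {
        // The mark file (and, before it, deleted_at in the remote index)
        // distinguishes "interrupted deletion" from a healthy, merely
        // not-yet-uploaded timeline.
        if self.deletion_mark_exists() {
            self.delete().await?; // re-enter the sequence from the top
        }
        Ok(())
    }
}
```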

resolves #4453
resolves https://github.com/neondatabase/neon/pull/4552 (the issue was
created with async cancellation in mind, but we can still have issues
with retries if metadata is deleted among the first files by remove_dir_all
(which doesn't have any ordering guarantees))

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-07-25 16:25:27 +03:00
Nick Randall
062159ac17 support non-interactive transactions in sql-over-http (#4654)
This PR adds support for a non-interactive transaction query endpoint.
It accepts an array of queries and parameters and returns an array of
query results. The queries will be run in a single transaction one
after another on the proxy side.
2023-07-25 13:03:55 +01:00
cui fliter
f2e2b8a7f4 fix some typos (#4662)
Typos fix

Signed-off-by: cui fliter <imcusg@gmail.com>
2023-07-25 14:39:29 +03:00
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Joonas Koivunen
f9214771b4 fix: count broken tenants more correctly (#4800)
Count only once; on startup, create the counter right away because we
will not observe any changes later.

A small fix for #4796, probably never reachable from outside.
2023-07-25 12:31:24 +03:00
Joonas Koivunen
77a68326c5 Thin out TenantState metric, keep set of broken tenants (#4796)
We currently have a timeseries for each of the tenants in different
states. We only want this for Broken. Other states could be counters.

Fix this by making `pageserver_tenant_states_count` a counter without a
`tenant_id`, and adding a `pageserver_broken_tenants_count` which has a
`tenant_id` label, with each broken tenant reported as 1.
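
A sketch of the two shapes using the raw `prometheus` crate (the pageserver goes through its own metrics wrapper, and the help strings here are assumptions):

```
use prometheus::{register_int_counter_vec, register_int_gauge_vec};

fn register_tenant_metrics() -> prometheus::Result<()> {
    // Bounded cardinality: one series per state, no tenant_id label.
    let tenant_states = register_int_counter_vec!(
        "pageserver_tenant_states_count",
        "Number of times tenants have entered each state",
        &["state"]
    )?;
    // One series per broken tenant, each reported as 1.
    let broken_tenants = register_int_gauge_vec!(
        "pageserver_broken_tenants_count",
        "Set of currently broken tenants",
        &["tenant_id"]
    )?;
    tenant_states.with_label_values(&["Active"]).inc();
    broken_tenants.with_label_values(&["<tenant-id>"]).set(1);
    Ok(())
}
```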
2023-07-25 11:15:54 +03:00
Joonas Koivunen
a25504deae Limit concurrent compactions (#4777)
Compactions can create a lot of concurrent work right now with #4265.

Limit compactions to use at most 6/8 background runtime threads.
2023-07-25 10:19:04 +03:00
Joonas Koivunen
294b8a8fde Convert per timeline metrics to global (#4769)
Cut down the per-(tenant, timeline) histograms by making them global:

- `pageserver_getpage_get_reconstruct_data_seconds`
- `pageserver_read_num_fs_layers`
- `pageserver_remote_operation_seconds`
- `pageserver_remote_timeline_client_calls_started`
- `pageserver_wait_lsn_seconds`
- `pageserver_io_operations_seconds`

---------

Co-authored-by: Shany Pozin <shany@neon.tech>
2023-07-25 00:43:27 +03:00
Alex Chi Z
407a20ceae add proxy unit tests for retry connections (#4721)
Now that we've refactored `connect_to_compute` to be generic, we can
test it with mock backends. In this PR, we mock the error API and the
connect_once API to test the retry behavior of `connect_to_compute`. In
the next PR, I'll add a mock for credentials so that we can also test
behavior with `wake_compute`.

ref https://github.com/neondatabase/neon/issues/4709

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-24 20:41:42 +03:00
arpad-m
e5b7ddfeee Preparatory pageserver async conversions (#4773)
In #4743, we'd like to convert the read path to use `async` rust. In
preparation of that, this PR switches some functions that are calling
lower level functions like `BlockReader::read_blk`,
`BlockCursor::read_blob`, etc into `async`. The PR does not switch all
functions however, and only focuses on the ones which are easy to
switch.

This leaves around some async functions that are (currently)
unnecessarily `async`, but on the other hand it makes future changes
smaller in diff.

Part of #4743 (but does not completely address it).
2023-07-24 14:01:54 +02:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately; instead it hangs until we time out (10s).

## Summary of changes

Refactor the loop to allow fast retries of compute_wake and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598 compute nodes are not accessible some time after wake up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update the connect retry mechanism to handle IO errors, sleeping for
100ms between attempts.
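
A sketch of that retry shape (the `connect_once` stub and the retry policy details are assumptions):

```
use std::io;
use std::time::Duration;

async fn connect_with_retries(max_retries: u32) -> io::Result<()> {
    let mut attempt = 0;
    loop {
        match connect_once().await {
            Ok(conn) => return Ok(conn),
            // DNS may not have propagated yet; treat IO errors as retryable.
            Err(_) if attempt < max_retries => {
                attempt += 1;
                tokio::time::sleep(Duration::from_millis(100)).await;
            }
            Err(e) => return Err(e),
        }
    }
}

async fn connect_once() -> io::Result<()> {
    Ok(()) // stub for the real TCP/TLS connect
}
```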

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work on the way to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949
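
A sketch of the runtime split being described (the builder configuration and names are assumptions):

```
use once_cell::sync::Lazy;
use tokio::runtime::Runtime;

static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background worker")
        .enable_all()
        .build()
        .expect("failed to create background runtime")
});

fn spawn_synthetic_size_worker() {
    // CPU-heavy / sync-IO work goes here, so blocking an executor thread
    // starves only other background tasks, not management requests.
    let _ = BACKGROUND_RUNTIME.spawn(async {
        /* logical size calculation ... */
    });
}
```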

(cherry picked from commit d6e35222ea)
2023-06-23 20:54:07 +02:00
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, when catalog updates were skipped, it wasn't clear from the
log that Postgres had started up completely.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and enhanced open-connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
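
For illustration, under rules 3 and 4 a request like the following (the envelope field names are assumptions modeled on node-postgres, not a documented wire format):

```
{ "query": "SELECT 1::int4 AS n, 'NaN'::float8 AS nan, ARRAY[1,2] AS arr", "params": [] }
```

might come back with the int as a JSON number, the NaN as a string, and the array as a JSON array:

```
{ "command": "SELECT", "rowCount": 1, "rows": [{ "n": 1, "nan": "NaN", "arr": [1, 2] }] }
```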
2023-05-25 17:59:17 +03:00
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates into the
layer map, if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but
better to live with the bug that we've had in production for a long
time than to rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally the buffer is empty before proxy pass,
but this is not the case with the pipeline mode of our npm driver; fixes
the connection hangup introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in the legacy proxy too, to avoid breaking client
connections and to allow, for example, backups to run even during
deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
    ```
The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL
14 and 15.2 for PostgreSQL 15.
    ```
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, so the old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
Previously we applied the rate limiting only up to receiving the headers
from S3, or somewhere near it. This commit adds an adapter which carries
the permit until the AsyncRead has been disposed of.

fixes #3662.
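
A sketch of such an adapter: the permit is tied to the reader's lifetime, so it is released on drop rather than when the headers arrive (types are illustrative):

```
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::io::{AsyncRead, ReadBuf};
use tokio::sync::OwnedSemaphorePermit;

struct PermittedRead<R> {
    inner: R,
    _permit: OwnedSemaphorePermit, // held until the wrapper is dropped
}

impl<R: AsyncRead + Unpin> AsyncRead for PermittedRead<R> {
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<std::io::Result<()>> {
        Pin::new(&mut self.get_mut().inner).poll_read(cx, buf)
    }
}
```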
2023-02-22 15:07:57 +02:00
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file, which led to a deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
These are happening in tests because of #3655, but they sure took some
time to appear.

This makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` error globally allowed, because it has
been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Before, only the timelines which had passed the `gc_horizon` were
processed, which failed with orphans at the tree_sort phase. Example
input is in the added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would had been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously tenant_size 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: don't stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of an already
deleted role. This was causing a crash loop when compute was restarted
with a spec that includes a delta operation for role deletion. To avoid
such cases, check that the role is still present before calling
`reassign_owned_objects`.

Resolves neondatabase/cloud#3553
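
A sketch of the check in SQL (the role name and reassignment target are placeholders):

```
DO $$
BEGIN
    -- Only reassign if the role still exists, so a re-applied spec
    -- does not crash on the second run.
    IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'role_to_delete') THEN
        REASSIGN OWNED BY role_to_delete TO cloud_admin;
        DROP OWNED BY role_to_delete;
        DROP ROLE role_to_delete;
    END IF;
END $$;
```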
2023-01-23 17:46:13 +02:00
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages fo the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build a custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres.
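
A minimal sketch of the pattern described above (not the actual compute_tools setup):

```
use tracing::{info, instrument};

#[instrument(skip_all)]
fn apply_spec(spec_name: &str) {
    info!(%spec_name, "applying spec"); // emitted inside the span
}

fn main() {
    // Keep logs on stderr, where the Postgres log also goes.
    tracing_subscriber::fmt()
        .with_writer(std::io::stderr)
        .init();
    apply_spec("demo");
}
```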
2023-01-23 17:46:12 +02:00
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to staging url
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main', and don't give the
console a chance to observe the error. Which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
2023-01-23 17:46:12 +02:00
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context in debugging that issue.
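
The kind of context being added, sketched with `anyhow::Context` (the function itself is illustrative):

```
use anyhow::Context;
use std::path::Path;

fn remove_timeline_dir(path: &Path) -> anyhow::Result<()> {
    std::fs::remove_dir_all(path)
        .with_context(|| format!("Failed to remove directory {}", path.display()))
}
```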
2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release`. Deleting them here
        storage_broker/src/bin/storage_broker.rs
        Hotfix toned down logging, but `main` has since implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to new region see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to new region see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to fix an issue with missing keepalives.
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but we need to understand the root cause better.
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but it also updates libc, and that causes
trouble in the resulting image as different extensions were built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (a previous omission).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-22 02:26:28 +03:00
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as the physical end of WAL. Now it returns `flush_lsn` (as before) to walproposer and `commit_lsn` to everyone else, including the pageserver (a rough sketch follows this list).
- There was an issue with backoff where the connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was sleeping before establishing the connection. This is fixed by reworking the retry logic.
- There was an issue with getting the `NoKeepAlives` reason in a loop. The issue is probably the same as the previous one.
- There was an issue with filtering safekeepers based on retry attempts, which could filter out some safekeepers indefinitely. This is fixed by using a retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding the timeline to safekeeper errors.
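
A minimal sketch of the IDENTIFY_SYSTEM behavior described above (hypothetical names, not the actual safekeeper code):

```rust
// Hypothetical sketch: which LSN IDENTIFY_SYSTEM reports as the end of WAL.
#[derive(Clone, Copy)]
enum Peer {
    WalProposer,
    Other, // pageserver, psql, etc.
}

fn reported_wal_end(peer: Peer, flush_lsn: u64, commit_lsn: u64) -> u64 {
    match peer {
        Peer::WalProposer => flush_lsn, // walproposer still sees the physical end of WAL
        Peer::Other => commit_lsn,      // everyone else sees the committed end
    }
}
```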

New retry logic works like this:
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.
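
As a rough sketch of that cooldown bookkeeping (struct and method names here are hypothetical, not the actual walreceiver code):

```rust
use std::time::{Duration, Instant};

// Hypothetical per-safekeeper candidate state.
struct Candidate {
    next_retry_at: Option<Instant>,
    retries: u32,
}

impl Candidate {
    /// Called when the walreceiver connection to this safekeeper closes:
    /// back off exponentially, capped at `max`.
    fn on_disconnect(&mut self, base: Duration, max: Duration) {
        let backoff = (base * 2u32.saturating_pow(self.retries)).min(max);
        self.next_retry_at = Some(Instant::now() + backoff);
        self.retries = self.retries.saturating_add(1);
    }

    /// Called when last_record_lsn advanced using WAL from this safekeeper:
    /// reset the cooldown so we may reconnect to it instantly.
    fn on_progress(&mut self) {
        self.next_retry_at = None;
        self.retries = 0;
    }

    /// A candidate is considered for connection only once its cooldown expires.
    fn is_eligible(&self, now: Instant) -> bool {
        self.next_retry_at.map_or(true, |t| now >= t)
    }
}
```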

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once_cell::sync::Lazy` (a minimal before/after sketch follows below)
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
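
For reference, a minimal before/after sketch of this migration pattern (the static's name and type are made up for illustration; requires the `once_cell` crate):

```rust
use once_cell::sync::Lazy;
use std::collections::HashMap;

// Before (lazy_static crate):
// lazy_static::lazy_static! {
//     static ref DEFAULTS: HashMap<&'static str, u64> = HashMap::new();
// }

// After (once_cell): same lazy, thread-safe, on-first-access initialization.
static DEFAULTS: Lazy<HashMap<&'static str, u64>> = Lazy::new(HashMap::new);

fn main() {
    // Lazy derefs to the inner value on first access, like lazy_static did.
    println!("{}", DEFAULTS.len());
}
```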

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions, or (in the future) to upgrade postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncate it

refer #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush the in-memory layer eventually when no new data arrives, which helps
safekeepers suspend activity (stop pushing to the broker). The default of 10m
should be ok.
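
As a minimal sketch of the kind of time-based trigger this option enables (hypothetical names, not the pageserver's actual flush loop):

```rust
use std::time::{Duration, Instant};

/// Flush the open in-memory layer if it has received no new WAL for the
/// configured checkpoint_timeout, even though it is not full yet.
fn should_flush_by_time(
    last_wal_received_at: Instant,
    checkpoint_timeout: Duration,
    now: Instant,
) -> bool {
    now.duration_since(last_wal_received_at) >= checkpoint_timeout
}
```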

* Share exponential backoff code and fix logic for delete task failure (#2252)

* Fix bug when importing large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.
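
A minimal sketch of that refactoring pattern (variant and field names here are illustrative, not the actual waldecoder types):

```rust
// Before: decoding progress was tracked with nullable fields whose valid
// combinations were only implicit.
#[allow(dead_code)]
struct DecoderStateBefore {
    contlen: Option<u32>,            // Some(..) only while reassembling a record
    partial_record: Option<Vec<u8>>, // must be Some exactly when contlen is Some
}

// After: each phase is an explicit variant, so impossible combinations
// cannot be represented.
#[allow(dead_code)]
enum State {
    WaitingForRecord,
    ReassemblingRecord { contlen: u32, partial_record: Vec<u8> },
}

fn describe(state: &State) -> &'static str {
    match state {
        State::WaitingForRecord => "waiting for a record header",
        State::ReassemblingRecord { .. } => "reassembling a continued record",
    }
}
```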

* disable `test_import_from_pageserver_multisegment` (#2258)

This test is now failing consistently on `main`. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause of the test failure.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler, and
human-readable. We don't use base64 anymore, since URL encoding
has us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery entirely and fix the version variable in get_binaries

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could have put it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; an error during decoding is normal
I decided to put it separately.

Test:
cargo test test_find_end_of_wal_last_crossing_segment
prepared earlier by @yeputons passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug where the `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` check filtered out all safekeepers in some strange cases. I removed this filter; it should probably help with #2237.
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping the safekeeper connection even when etcd is down.
- The safekeeper connection now fails if the pageserver doesn't receive safekeeper messages for some time. Usually a safekeeper sends messages at least once per second.
- The `LaggingWal` check now uses `commit_lsn` directly from the safekeeper. This fixes the issue with frequent reconnects when the compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper is stuck (a rough sketch follows below).
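
A minimal sketch of that reworked `NoWalTimeout` condition (hypothetical names, not the actual walreceiver code): fire only when newer WAL is known to exist but nothing has been streamed for `lagging_wal_timeout`.

```rust
use std::time::{Duration, Instant};

// Hypothetical status snapshot for the connected safekeeper.
struct WalStreamStatus {
    broker_commit_lsn: u64, // newest commit_lsn we know of via the broker
    streamed_lsn: u64,      // how far the connected safekeeper has streamed to us
    last_received_at: Instant,
}

/// Reconnect only when newer WAL is known to exist and the connected
/// safekeeper has streamed nothing for `lagging_wal_timeout`.
fn no_wal_timeout(status: &WalStreamStatus, lagging_wal_timeout: Duration, now: Instant) -> bool {
    let newer_wal_exists = status.broker_commit_lsn > status.streamed_lsn;
    let stream_idle = now.duration_since(status.last_received_at) >= lagging_wal_timeout;
    newer_wal_exists && stream_idle
}
```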

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses the neondatabase/postgres#171 branch and several required commits from main to use only locally built compute-tools. This should allow us to roll out the safekeeper sync issue fix on prod.
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2022-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
89 changed files with 3191 additions and 1444 deletions

View File

@@ -21,4 +21,5 @@
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf

View File

@@ -209,4 +209,4 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}

View File

@@ -955,22 +955,15 @@ jobs:
version: [ v14, v15 ]
env:
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
@@ -978,46 +971,23 @@ jobs:
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensitons (they always present on compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
- name: Upload postgres-extensions to S3
# TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
if: false
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:

Cargo.lock (generated): 12 changes
View File

@@ -2506,6 +2506,7 @@ dependencies = [
"pageserver",
"postgres_ffi",
"svg_fmt",
"tokio",
"utils",
"workspace_hack",
]
@@ -2544,6 +2545,7 @@ dependencies = [
"metrics",
"nix",
"num-traits",
"num_cpus",
"once_cell",
"pageserver_api",
"pin-project-lite",
@@ -2780,7 +2782,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2793,7 +2795,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"native-tls",
"tokio",
@@ -2804,7 +2806,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -2822,7 +2824,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4312,7 +4314,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"async-trait",
"byteorder",

View File

@@ -144,11 +144,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,7 +183,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
################# Binary contents sections

View File

@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
#########################################################################################
#
@@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
mkdir -p /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
#########################################################################################
#
@@ -419,12 +431,16 @@ RUN apt-get update && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
#########################################################################################
#
@@ -535,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -553,16 +567,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b26
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sort > /before.txt && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
#########################################################################################
#
@@ -754,16 +769,23 @@ RUN rm /usr/local/pgsql/lib/lib*.a
# Extension only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# As for now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
# As for now, it's only a couple for testing purposes
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
#########################################################################################
#

View File

@@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
.PHONY: postgres-clean-%
postgres-clean-%:

View File

@@ -193,6 +193,13 @@ fn main() -> Result<()> {
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
// TODO this can stall startups in the unlikely event that we bind
// this compute node while it's busy prewarming. It's not too
// bad because it's just 100ms and unlikely, but it's an
// avoidable problem.
compute.prewarm_postgres()?;
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();

View File

@@ -8,9 +8,11 @@ use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio_postgres;
use tracing::{info, instrument, warn};
use tracing::{error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -21,6 +23,7 @@ use utils::measured_stream::MeasuredReader;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
@@ -86,6 +89,7 @@ pub struct ParsedSpec {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
}
@@ -103,6 +107,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
} else {
vec![]
}
} else {
spec.safekeeper_connstrings.clone()
};
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
@@ -128,6 +147,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
Ok(ParsedSpec {
spec,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
timeline_id,
@@ -309,6 +329,102 @@ impl ComputeNode {
Ok(())
}
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
) -> Result<Option<Lsn>> {
// Construct a connection config for each safekeeper
let pspec: ParsedSpec = compute_state
.pspec
.as_ref()
.expect("spec must be set")
.clone();
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
// Format connstr
let id = connstr.clone();
let connstr = format!("postgresql://no_user@{}", connstr);
let options = format!(
"-c timeline_id={} tenant_id={}",
pspec.timeline_id, pspec.tenant_id
);
// Construct client
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
config.options(&options);
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
config.password(storage_auth_token);
}
(id, config)
});
// Create task set to query all safekeepers
let mut tasks = FuturesUnordered::new();
let quorum = sk_configs.len() / 2 + 1;
for (id, config) in sk_configs {
let timeout = tokio::time::Duration::from_millis(100);
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
tasks.push(tokio::spawn(task));
}
// Get a quorum of responses or errors
let mut responses = Vec::new();
let mut join_errors = Vec::new();
let mut task_errors = Vec::new();
let mut timeout_errors = Vec::new();
while let Some(response) = tasks.next().await {
match response {
Ok(Ok(Ok(r))) => responses.push(r),
Ok(Ok(Err(e))) => task_errors.push(e),
Ok(Err(e)) => timeout_errors.push(e),
Err(e) => join_errors.push(e),
};
if responses.len() >= quorum {
break;
}
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
break;
}
}
// In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum {
error!(
"failed sync safekeepers check {:?} {:?} {:?}",
join_errors, task_errors, timeout_errors
);
return Ok(None);
}
Ok(check_if_synced(responses))
}
// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();
// Run actual work with new tokio runtime
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
// Record runtime
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
result
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip_all)]
@@ -371,10 +487,14 @@ impl ComputeNode {
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);
lsn
}
@@ -412,6 +532,50 @@ impl ComputeNode {
Ok(())
}
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
// Create pgdata
let pgdata = &format!("{}.warmup", self.pgdata);
create_pgdata(pgdata)?;
// Run initdb to completion
info!("running initdb");
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
Command::new(initdb_bin)
.args(["-D", pgdata])
.output()
.expect("cannot start initdb process");
// Write conf
use std::io::Write;
let conf_path = Path::new(pgdata).join("postgresql.conf");
let mut file = std::fs::File::create(conf_path)?;
writeln!(file, "shared_buffers=65536")?;
writeln!(file, "port=51055")?; // Nobody should be connecting
writeln!(file, "shared_preload_libraries = 'neon'")?;
// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
// Stop it when it's ready
info!("waiting for postgres");
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
pg.wait()?;
info!("done prewarming");
// clean up
let _ok = fs::remove_dir_all(pgdata);
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]

View File

@@ -13,3 +13,4 @@ pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod sync_sk;

View File

@@ -0,0 +1,98 @@
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;
#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
NotFound,
Ok(TimelineStatusOkResponse),
}
#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
flush_lsn: Lsn,
commit_lsn: Lsn,
}
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
pub async fn ping_safekeeper(
id: String,
config: tokio_postgres::Config,
) -> Result<TimelineStatusResponse> {
// TODO add retries
// Connect
info!("connecting to {}", id);
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
// Query
info!("querying {}", id);
let result = client.simple_query("TIMELINE_STATUS").await?;
// Parse result
info!("done with {}", id);
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
use std::str::FromStr;
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
});
Ok(response)
} else {
// Timeline doesn't exist
Ok(TimelineStatusResponse::NotFound)
}
}
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
// Check if all responses are ok
let ok_responses: Vec<TimelineStatusOkResponse> = responses
.iter()
.filter_map(|r| match r {
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
_ => None,
})
.cloned()
.collect();
if ok_responses.len() < responses.len() {
info!(
"not synced. Only {} out of {} know about this timeline",
ok_responses.len(),
responses.len()
);
return None;
}
// Get the min and the max of everything
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
let commit_max = commit.iter().max().unwrap();
let commit_min = commit.iter().min().unwrap();
let flush_max = flush.iter().max().unwrap();
let flush_min = flush.iter().min().unwrap();
// Check that all values are equal
if commit_min != commit_max {
info!("not synced. {:?} {:?}", commit_min, commit_max);
return None;
}
if flush_min != flush_max {
info!("not synced. {:?} {:?}", flush_min, flush_max);
return None;
}
// Check that commit == flush
if commit_max != flush_max {
info!("not synced. {:?} {:?}", commit_max, flush_max);
return None;
}
Some(*commit_max)
}

View File

@@ -564,9 +564,7 @@ impl Endpoint {
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
}
}
}

View File

@@ -70,6 +70,7 @@ where
pub struct ComputeMetrics {
pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64,
pub sync_sk_check_ms: u64,
pub basebackup_ms: u64,
pub basebackup_bytes: u64,
pub start_postgres_ms: u64,

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng;
use serde::Serialize;
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -17,6 +17,32 @@ pub enum EventType {
},
}
impl EventType {
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
use EventType::*;
match self {
Absolute { time } => Some(time),
_ => None,
}
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
start_time,
stop_time,
} => Some(start_time..stop_time),
_ => None,
}
}
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
}
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]

View File

@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
#[derive(Debug)]
pub struct FeCloseMessage;
/// An error occured while parsing or serializing raw stream into Postgres
/// An error occurred while parsing or serializing raw stream into Postgres
/// messages.
#[derive(thiserror::Error, Debug)]
pub enum ProtocolError {

View File

@@ -200,13 +200,17 @@ impl S3Bucket {
)
}
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in path.0.iter() {
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
full_path.push_str(segment.to_str().unwrap_or_default());
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path
.get_path()
.to_string_lossy()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
.to_string();
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + &path_string,
None => path_string,
}
full_path
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -427,10 +431,12 @@ impl RemoteStorage for S3Bucket {
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
..GetObjectRequest::default()
range: None,
})
.await
}
@@ -523,3 +529,63 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::num::NonZeroUsize;
use std::path::Path;
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
let all_paths = vec!["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
.collect();
let prefixes = [
None,
Some(""),
Some("test/prefix"),
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
];
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
let config = S3Config {
bucket_name: "bucket".to_owned(),
bucket_region: "region".to_owned(),
prefix_in_bucket: prefix.map(str::to_string),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
};
let storage = S3Bucket::new(&config).expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];
assert_eq!(result, expected);
}
}
}
}

View File

@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test/";
const BASE_PREFIX: &str = "test";
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.

View File

@@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
Ok(dir.next_entry().await?.is_none())
}
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
if e.kind() == io::ErrorKind::NotFound {
Ok(())
} else {
Err(e)
}
}
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
F: Fn() -> io::Result<()>,
{
fs_operation().or_else(ignore_not_found)
}
#[cfg(test)]
mod test {
use std::path::PathBuf;
use crate::fs_ext::is_directory_empty;
use super::ignore_absent_files;
#[test]
fn is_empty_dir() {
use super::PathExt;
@@ -75,4 +92,21 @@ mod test {
std::fs::remove_file(&file_path).unwrap();
assert!(is_directory_empty(file_path).await.is_err());
}
#[test]
fn ignore_absent_files_works() {
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
let file_path: PathBuf = dir_path.join("testfile");
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
assert!(!file_path.exists());
}
}

View File

@@ -1,5 +1,7 @@
use std::ffi::OsStr;
use std::{fmt, str::FromStr};
use anyhow::Context;
use hex::FromHex;
use rand::Rng;
use serde::{Deserialize, Serialize};
@@ -213,6 +215,18 @@ pub struct TimelineId(Id);
id_newtype!(TimelineId);
impl TryFrom<Option<&OsStr>> for TimelineId {
type Error = anyhow::Error;
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
value
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| format!("Could not parse timeline id from {:?}", value))
}
}
/// Neon Tenant Id represents the identifier of a particular tenant.
/// Is used for distinguishing requests and data belonging to different users.
///

View File

@@ -35,6 +35,8 @@ humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }
num-traits.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true

View File

@@ -13,6 +13,7 @@ clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }
postgres_ffi.workspace = true
tokio.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
}
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -129,7 +129,7 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
Ok(holes)
}
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
@@ -160,7 +160,7 @@ pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap())
{
if layer_file.is_delta {
layer_file.holes = get_holes(&layer.path(), max_holes)?;
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
n_deltas += 1;
}
layers.push(layer_file);

View File

@@ -43,8 +43,7 @@ pub(crate) enum LayerCmd {
},
}
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::blob_io::BlobCursor;
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::block_io::BlockReader;
let path = path.as_ref();
@@ -78,7 +77,7 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
Ok(())
}
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join("tenants"))? {
@@ -153,7 +152,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
);
if layer_file.is_delta {
read_delta_file(layer.path())?;
read_delta_file(layer.path()).await?;
} else {
anyhow::bail!("not supported yet :(");
}

View File

@@ -72,12 +72,13 @@ struct AnalyzeLayerMapCmd {
max_holes: Option<usize>,
}
fn main() -> anyhow::Result<()> {
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
match cli.command {
Commands::Layer(cmd) => {
layers::main(&cmd)?;
layers::main(&cmd).await?;
}
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
@@ -86,7 +87,7 @@ fn main() -> anyhow::Result<()> {
draw_timeline_dir::main()?;
}
Commands::AnalyzeLayerMap(cmd) => {
layer_map_analyzer::main(&cmd)?;
layer_map_analyzer::main(&cmd).await?;
}
Commands::PrintLayerFile(cmd) => {
if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -94,7 +95,7 @@ fn main() -> anyhow::Result<()> {
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&cmd.path)?;
print_layerfile(&cmd.path).await?;
}
}
};
@@ -113,12 +114,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
Ok(())
}
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx)
dump_layerfile_from_path(path, true, &ctx).await
}
fn handle_metadata(

View File

@@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
@@ -601,6 +602,17 @@ impl PageServerConf {
)
}
pub fn timeline_delete_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}

View File

@@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use anyhow;
use chrono::Utc;
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use pageserver_api::models::TenantState;
use reqwest::Url;
@@ -18,12 +18,6 @@ use std::time::Duration;
use tracing::*;
use utils::id::{NodeId, TenantId, TimelineId};
const WRITTEN_SIZE: &str = "written_size";
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
const RESIDENT_SIZE: &str = "resident_size";
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
#[serde_as]
@@ -44,6 +38,121 @@ pub struct PageserverConsumptionMetricsKey {
pub metric: &'static str,
}
impl PageserverConsumptionMetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
impl AbsoluteValueFactory {
fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
let key = self.0;
let time = Utc::now();
(key, (EventType::Absolute { time }, val))
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
fn from_previous_up_to(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
(
key,
(
EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
},
val,
),
)
}
fn key(&self) -> &PageserverConsumptionMetricsKey {
&self.0
}
}
// the static part of a PageserverConsumptionMetricsKey
impl PageserverConsumptionMetricsKey {
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size",
}
.absolute_values()
}
/// Values will be the difference of the latest written_size (last_record_lsn) to what we
/// previously sent.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size_bytes_delta",
}
.incremental_values()
}
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "timeline_logical_size",
}
.absolute_values()
}
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "remote_storage_size",
}
.absolute_values()
}
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "resident_size",
}
.absolute_values()
}
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "synthetic_storage_size",
}
.absolute_values()
}
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
@@ -79,7 +188,7 @@ pub async fn collect_metrics(
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
let mut cached_metrics = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
loop {
@@ -121,13 +230,13 @@ pub async fn collect_metrics(
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
pub async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
ctx: &RequestContext,
send_cached: bool,
) {
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
trace!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
@@ -166,27 +275,80 @@ pub async fn collect_metrics_iteration(
if timeline.is_active() {
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: WRITTEN_SIZE,
let (key, written_size_now) =
PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
.now(timeline_written_size);
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
// features might change this.
let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
tenant_id,
timeline.timeline_id,
);
// use this when available, because in a stream of incremental values it will be
// accurate, whereas when last_record_lsn stops moving we will only cache the last
// one of those.
let last_stop_time =
cached_metrics
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cached_metrics
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
// written_size_delta_bytes
current_metrics.extend(
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let up_to = written_size_now
.0
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let key_value =
written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
Some(key_value)
} else {
None
},
timeline_written_size,
));
);
// written_size
current_metrics.push((key, written_size_now));
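// Editor's sketch (not part of the diff): the delta derivation above, pulled
// out as a pure function for readability. Names are illustrative.
#[allow(dead_code)]
fn written_size_delta_sketch(
    now: (DateTime<Utc>, u64),  // current (wall clock, last_record_lsn as bytes)
    prev: (DateTime<Utc>, u64), // last sent absolute value, or the load-time LSN
) -> Option<(DateTime<Utc>, DateTime<Utc>, u64)> {
    // last_record_lsn is monotonic today; checked_sub guards against features
    // that might move it backwards (see the TODO referencing #2592 above).
    now.1.checked_sub(prev.1).map(|delta| (prev.0, now.0, delta))
}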
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
// Only send timeline logical size when it is fully calculated.
Ok((size, is_exact)) if is_exact => {
current_metrics.push((
PageserverConsumptionMetricsKey {
current_metrics.push(
PageserverConsumptionMetricsKey::timeline_logical_size(
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: TIMELINE_LOGICAL_SIZE,
},
size,
));
timeline.timeline_id,
)
.now(size),
);
}
Ok((_, _)) => {}
Err(err) => {
@@ -205,14 +367,10 @@ pub async fn collect_metrics_iteration(
match tenant.get_remote_size().await {
Ok(tenant_remote_size) => {
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: REMOTE_STORAGE_SIZE,
},
tenant_remote_size,
));
current_metrics.push(
PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
.now(tenant_remote_size),
);
}
Err(err) => {
error!(
@@ -222,14 +380,9 @@ pub async fn collect_metrics_iteration(
}
}
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: RESIDENT_SIZE,
},
tenant_resident_size,
));
current_metrics.push(
PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
);
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
@@ -237,23 +390,27 @@ pub async fn collect_metrics_iteration(
if tenant_synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: SYNTHETIC_STORAGE_SIZE,
},
tenant_synthetic_size,
));
current_metrics.push(
PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
.now(tenant_synthetic_size),
);
}
}
// Filter metrics, unless we want to send all metrics, including cached ones.
// See: https://github.com/neondatabase/neon/issues/3485
if !send_cached {
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
Some(val) => val != curr_val,
None => true,
current_metrics.retain(|(curr_key, (kind, curr_val))| {
if kind.is_incremental() {
// incremental values (currently only written_size_delta) should not get any cache
// deduplication, because upstream also reads them as an "is still alive" signal.
true
} else {
match cached_metrics.get(curr_key) {
Some((_, val)) => val != curr_val,
None => true,
}
}
});
}
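// Editor's sketch (not part of the diff): the retain predicate above as a
// standalone helper, to make the incremental-vs-absolute split explicit.
#[allow(dead_code)]
fn should_send_sketch(cached: Option<&(EventType, u64)>, kind: &EventType, value: u64) -> bool {
    if kind.is_incremental() {
        // never deduplicated: upstream also reads these as a liveness signal
        true
    } else {
        cached.map_or(true, |(_, cached_value)| *cached_value != value)
    }
}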
@@ -272,8 +429,8 @@ pub async fn collect_metrics_iteration(
chunk_to_send.clear();
// enrich metrics with type,timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
kind: EventType::Absolute { time: Utc::now() },
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
metric: curr_key.metric,
idempotency_key: idempotency_key(node_id.to_string()),
value: *curr_val,


@@ -545,12 +545,12 @@ async fn collect_eviction_candidates(
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
// That's what's typically used by the various background loops.
//
// The default can be overriden with a fixed value in the tenant conf.
// The default can be overridden with a fixed value in the tenant conf.
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
overriden_size=s,
overridden_size=s,
"using overridden min resident size for tenant"
);
s


@@ -994,31 +994,29 @@ async fn timeline_gc_handler(
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
.await
.context("spawn compaction task")
.map_err(ApiError::InternalServerError)?;
let result: anyhow::Result<()> = result_receiver
.await
.context("receive compaction result")
.map_err(ApiError::InternalServerError)?;
result.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
.await
}
// Run checkpoint immediately on given timeline.
async fn timeline_checkpoint_handler(
request: Request<Body>,
_cancel: CancellationToken,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1031,13 +1029,13 @@ async fn timeline_checkpoint_handler(
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&ctx)
.compact(&cancel, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
.await
}
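// Editor's sketch (not part of the diff): the shape of a cancellation-aware
// compaction step, assuming a `CancellationToken` is threaded through as in
// the handler changes above. The "unit of work" is hypothetical.
#[allow(dead_code)]
async fn compact_step_sketch(cancel: &CancellationToken) -> anyhow::Result<()> {
    // check between units of work so a shutdown or HTTP client disconnect
    // stops compaction promptly
    if cancel.is_cancelled() {
        anyhow::bail!("compaction cancelled");
    }
    // ... perform one unit of work here ...
    Ok(())
}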


@@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, which expects the ignored tenant to be properly loaded
@@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool {
}
}
pub fn is_uninit_mark(path: &Path) -> bool {
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
match path.file_name() {
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
Some(name) => name.to_string_lossy().ends_with(suffix),
None => false,
}
}
pub fn is_uninit_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub fn is_delete_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
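// Editor's sketch (not part of the diff): how the marker checks above are
// typically combined while scanning a timelines directory.
#[allow(dead_code)]
fn classify_entry_sketch(path: &Path) -> &'static str {
    if is_uninit_mark(path) {
        "uninit mark: creation never finished, remove the dir and the mark"
    } else if is_delete_mark(path) {
        "delete mark: deletion was interrupted, resume it"
    } else {
        "regular timeline dir"
    }
}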
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
if let Some(e) = e.io_error() {
if e.kind() == std::io::ErrorKind::NotFound {
return true;
}
}
false
}
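// Editor's sketch (not part of the diff): typical use of the helper above,
// tolerating entries that vanish (e.g. temp files removed) mid-walk.
#[allow(dead_code)]
fn walk_tolerating_races_sketch(root: &Path) -> anyhow::Result<()> {
    for entry in walkdir::WalkDir::new(root) {
        match entry {
            Ok(_entry) => { /* process _entry.path() */ }
            // raced with concurrent deletion; skip rather than fail the walk
            Err(e) if is_walkdir_io_not_found(&e) => continue,
            Err(e) => return Err(e.into()),
        }
    }
    Ok(())
}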
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///


@@ -6,7 +6,6 @@ use metrics::{
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::models::TenantState;
use strum::VariantNames;
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
@@ -74,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
// Buckets for background operations like compaction, GC, size calculation
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_storage_operations_seconds_global",
"Time spent on storage operations",
@@ -84,18 +83,17 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
&["tenant_id", "timeline_id"],
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -104,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
@@ -112,17 +110,16 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
@@ -246,11 +243,10 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wait_lsn_seconds",
"Time spent waiting for WAL to arrive",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
@@ -284,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
"Total on-demand downloaded layers"
@@ -292,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_bytes_total",
"Total bytes of layers on-demand downloaded",
@@ -309,16 +305,29 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
"Count of tenants per state",
&["tenant_id", "state"]
&["state"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
/// A set of broken tenants.
///
/// These are expected to be so rare that a set is fine. "Set" as in one new timeseries per
/// broken tenant.
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_broken_tenants_count",
"Set of broken tenants",
&["tenant_id"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_synthetic_cached_size_bytes",
"Synthetic size of each tenant in bytes",
@@ -376,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_unexpected_ondemand_downloads_count",
"Number of unexpected on-demand downloads. \
@@ -499,23 +508,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
30.000, // 30000 ms
];
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
];
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
/// Tracks time taken by fs operations near VirtualFile.
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
/// - close (dropping [`std::fs::File`])
/// - close-by-replace (close by replacement algorithm)
/// - read (`read_at`)
/// - write (`write_at`)
/// - seek (modify internal position or file length query)
/// - fsync ([`std::fs::File::sync_all`])
/// - metadata ([`std::fs::File::metadata`])
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
@@ -605,7 +622,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
at a given instant. It gives you a better idea of the queue depth \
than plotting the gauge directly, since operations may complete faster \
than the sampling interval.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&["file_kind", "op_kind"],
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
)
@@ -662,18 +679,18 @@ impl RemoteOpFileKind {
}
}
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_operation_seconds",
"Time spent on remote storage operations. \
Grouped by tenant, timeline, operation_kind and status. \
Does not account for time spent waiting in remote timeline client's queues.",
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
&["file_kind", "op_kind", "status"]
)
.expect("failed to define a metric")
});
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_task_events",
"Number of task start/stop/fail events.",
@@ -682,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",
"Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -693,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new
// walreceiver metrics
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_started_connections_total",
"Number of started walreceiver connections"
@@ -701,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_walreceiver_active_managers",
"Number of active walreceiver managers"
@@ -709,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_switches_total",
"Number of walreceiver manager change_connection calls",
@@ -718,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_broker_updates_total",
"Number of received broker updates in walreceiver"
@@ -726,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_candidates_events_total",
"Number of walreceiver candidate events",
@@ -735,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
// Metrics collected on WAL redo operations
@@ -785,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
};
}
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
"Time spent on WAL redo",
@@ -794,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
@@ -803,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -812,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo sent to Postgres",
@@ -821,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
@@ -897,7 +915,6 @@ impl StorageTimeMetrics {
pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub get_reconstruct_data_time_histo: Histogram,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -906,9 +923,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub resident_physical_size_gauge: UIntGauge,
pub read_num_fs_layers: Histogram,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -925,9 +940,6 @@ impl TimelineMetrics {
) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
@@ -948,9 +960,6 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
@@ -966,16 +975,12 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
TimelineMetrics {
tenant_id,
timeline_id,
get_reconstruct_data_time_histo,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -984,7 +989,6 @@ impl TimelineMetrics {
garbage_collect_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
resident_physical_size_gauge,
current_logical_size_gauge,
num_persistent_files_created,
@@ -993,7 +997,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
read_num_fs_layers,
}
}
}
@@ -1002,15 +1005,12 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -1022,9 +1022,6 @@ impl Drop for TimelineMetrics {
let _ =
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_TIME_OPERATIONS {
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1039,9 +1036,7 @@ impl Drop for TimelineMetrics {
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
for state in TenantState::VARIANTS {
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
}
// we leave the BROKEN_TENANTS_SET entry if any
}
use futures::Future;
@@ -1056,9 +1051,7 @@ pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
}
@@ -1068,14 +1061,13 @@ impl RemoteTimelineClientMetrics {
RemoteTimelineClientMetrics {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
remote_operation_time: Mutex::new(HashMap::default()),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
calls_started_hist: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
}
}
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
guard
@@ -1089,26 +1081,17 @@ impl RemoteTimelineClientMetrics {
})
.clone()
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
status: &'static str,
) -> Histogram {
let mut guard = self.remote_operation_time.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str(), status);
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
key.2,
])
.unwrap()
});
metric.clone()
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[key.0, key.1, key.2])
.unwrap()
}
fn calls_unfinished_gauge(
@@ -1136,19 +1119,10 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
let mut guard = self.calls_started_hist.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[key.0, key.1])
.unwrap()
}
fn bytes_started_counter(
@@ -1328,15 +1302,10 @@ impl Drop for RemoteTimelineClientMetrics {
tenant_id,
timeline_id,
remote_physical_size_gauge,
remote_operation_time,
calls_unfinished_gauge,
calls_started_hist,
bytes_started_counter,
bytes_finished_counter,
} = self;
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
}
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
@@ -1345,14 +1314,6 @@ impl Drop for RemoteTimelineClientMetrics {
b,
]);
}
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
tenant_id,
@@ -1434,15 +1395,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
}
pub fn preinitialize_metrics() {
// We want to alert on this metric increasing.
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
// Python tests need these, and we alert on some of them.
//
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
// order:
// - global metrics reside in a Lazy<PageserverMetrics>
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
// - could move the statics into TimelineMetrics::new()?
// Same as above for this metric, but it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
// counters
[
&MATERIALIZED_PAGE_CACHE_HIT,
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
&UNEXPECTED_ONDEMAND_DOWNLOADS,
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,
&WALRECEIVER_CANDIDATES_REMOVED,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// Python tests need these.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
// histograms
[
&READ_NUM_FS_LAYERS,
&RECONSTRUCT_TIME,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
]
.into_iter()
.for_each(|h| {
Lazy::force(h);
});
}
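// Editor's note (not part of the diff): forcing a Lazy metric registers its
// time series at startup, so alerting can distinguish "never incremented"
// (value 0) from "metric absent" (pageserver not exporting it at all).
#[allow(dead_code)]
fn preinit_sketch() {
    Lazy::force(&UNEXPECTED_ONDEMAND_DOWNLOADS);
    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
}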


@@ -130,11 +130,25 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init (and thus any panic) to happen now
let _ = BACKGROUND_RUNTIME.handle();
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would have already panicked on parsing errors or NotUnicode
//
// this will be wrong if any of the runtimes gets their worker threads configured to something
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
});
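// Editor's sketch (not part of the diff): one plausible consumer of the
// constant above, sizing a concurrency limit to the background runtime. The
// semaphore itself is hypothetical, not something the diff introduces.
#[allow(dead_code)]
fn background_concurrency_limit_sketch() -> tokio::sync::Semaphore {
    tokio::sync::Semaphore::new(*BACKGROUND_RUNTIME_WORKER_THREADS)
}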
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);
@@ -545,7 +559,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
pub async fn shutdown_watcher() {
let token = SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_requested() called in an unexpected task or thread");
.expect("shutdown_watcher() called in an unexpected task or thread");
token.cancelled().await;
}


@@ -18,8 +18,8 @@ use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tokio::sync::OwnedMutexGuard;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
@@ -28,7 +28,6 @@ use std::cmp::min;
use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
@@ -47,6 +46,7 @@ use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
use self::config::TenantConf;
use self::metadata::LoadMetadataError;
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
@@ -64,12 +64,12 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::tenant::storage_layer::Layer;
use crate::InitializationOrder;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
@@ -265,6 +265,14 @@ pub enum GetTimelineError {
},
}
#[derive(Debug, thiserror::Error)]
pub enum LoadLocalTimelineError {
#[error("FailedToLoad")]
Load(#[source] anyhow::Error),
#[error("FailedToResumeDeletion")]
ResumeDeletion(#[source] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("NotFound")]
@@ -318,14 +326,6 @@ impl std::fmt::Display for WaitToBecomeActiveError {
}
}
struct DeletionGuard(OwnedMutexGuard<bool>);
impl DeletionGuard {
fn is_deleted(&self) -> bool {
*self.0
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
@@ -336,6 +336,16 @@ pub enum CreateTimelineError {
Other(#[from] anyhow::Error),
}
struct TenantDirectoryScan {
sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
}
enum CreateTimelineCause {
Load,
Delete,
}
impl Tenant {
/// Yet another helper for timeline initialization.
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -374,6 +384,7 @@ impl Tenant {
ancestor.clone(),
remote_client,
init_order,
CreateTimelineCause::Load,
)?;
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -802,11 +813,14 @@ impl Tenant {
tenant
}
pub fn scan_and_sort_timelines_dir(
self: Arc<Tenant>,
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
let timelines_dir = self.conf.timelines_path(&self.tenant_id);
fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
// Note timelines_to_resume_deletion needs to be separate because it may not be sortable
// from the point of view of `tree_sort_timelines`. I.e. some parents can be missing because deletion
// completed in non-topological order (for example, because the parent had a smaller number of layer files in it)
let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
let timelines_dir = self.conf.timelines_path(&self.tenant_id);
for entry in
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
@@ -834,16 +848,13 @@ impl Tenant {
);
continue;
}
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem())
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
@@ -856,6 +867,47 @@ impl Tenant {
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else if crate::is_delete_mark(&timeline_dir) {
// If metadata exists, load as usual, continue deletion
let timeline_id =
TimelineId::try_from(timeline_dir.file_stem()).with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_dir.display()
)
})?;
match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
Ok(metadata) => {
timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
}
Err(e) => match &e {
LoadMetadataError::Read(r) => {
if r.kind() != io::ErrorKind::NotFound {
return Err(anyhow::anyhow!(e)).with_context(|| {
format!("Failed to load metadata for timeline_id {timeline_id}")
});
}
// If metadata doesn't exist it means that we've crashed without
// completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
// So save timeline_id for a later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
// We can't do it here because the method is async, so we'd need block_on,
// and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations,
// so that basically results in a cycle:
// spawn_blocking
// - block_on
// - spawn_blocking
// which can lead to running out of threads in the blocking pool.
timelines_to_resume_deletion.push((timeline_id, None));
}
_ => {
return Err(anyhow::anyhow!(e)).with_context(|| {
format!("Failed to load metadata for timeline_id {timeline_id}")
})
}
},
}
} else {
if !timeline_dir.exists() {
warn!(
@@ -864,12 +916,8 @@ impl Tenant {
);
continue;
}
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
let timeline_id =
TimelineId::try_from(timeline_dir.file_name()).with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
@@ -891,6 +939,14 @@ impl Tenant {
continue;
}
let timeline_delete_mark_file = self
.conf
.timeline_delete_mark_file_path(self.tenant_id, timeline_id);
if timeline_delete_mark_file.exists() {
// Cleanup should be done in `is_delete_mark` branch above
continue;
}
let file_name = entry.file_name();
if let Ok(timeline_id) =
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
@@ -910,7 +966,10 @@ impl Tenant {
// Sort the array of timeline IDs into tree-order, so that parent comes before
// all its children.
tree_sort_timelines(timelines_to_load)
tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
sorted_timelines_to_load: sorted_timelines,
timelines_to_resume_deletion,
})
}
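// Editor's sketch (not part of the diff): the parent-before-children ordering
// that `tree_sort_timelines` establishes, simplified to plain ancestor ids.
// The real function operates on TimelineMetadata and reports missing parents
// or cycles as errors instead of panicking.
#[allow(dead_code)]
fn tree_sort_sketch(mut pending: HashMap<TimelineId, Option<TimelineId>>) -> Vec<TimelineId> {
    let mut sorted = Vec::with_capacity(pending.len());
    let mut placed = std::collections::HashSet::new();
    while !pending.is_empty() {
        // every timeline whose ancestor (if any) is already placed can go next
        let ready: Vec<_> = pending
            .iter()
            .filter(|(_, ancestor)| ancestor.map_or(true, |a| placed.contains(&a)))
            .map(|(id, _)| *id)
            .collect();
        assert!(!ready.is_empty(), "missing ancestor or cycle");
        for id in ready {
            pending.remove(&id);
            placed.insert(id);
            sorted.push(id);
        }
    }
    sorted
}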
///
@@ -936,7 +995,7 @@ impl Tenant {
let span = info_span!("blocking");
let cloned = Arc::clone(self);
let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
let scan = tokio::task::spawn_blocking(move || {
let _g = span.entered();
cloned.scan_and_sort_timelines_dir()
})
@@ -947,10 +1006,60 @@ impl Tenant {
// FIXME original collect_timeline_files contained one more check:
// 1. "Timeline has no ancestor and no layer files"
for (timeline_id, local_metadata) in sorted_timelines {
self.load_local_timeline(timeline_id, local_metadata, init_order, ctx)
// Process loadable timelines first
for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
if let Err(e) = self
.load_local_timeline(timeline_id, local_metadata, init_order, ctx, false)
.await
.with_context(|| format!("load local timeline {timeline_id}"))?;
{
match e {
LoadLocalTimelineError::Load(source) => {
return Err(anyhow::anyhow!(source)
.context("Failed to load local timeline: {timeline_id}"))
}
LoadLocalTimelineError::ResumeDeletion(source) => {
// Make sure a resumed deletion won't fail loading for the entire tenant.
error!("Failed to resume timeline deletion: {source:#}")
}
}
}
}
// Resume deletion ones with deleted_mark
for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
match maybe_local_metadata {
None => {
// See comment in `scan_and_sort_timelines_dir`.
if let Err(e) =
DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
.await
{
warn!(
"cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
timeline_id, e
);
}
}
Some(local_metadata) => {
if let Err(e) = self
.load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
.await
{
match e {
LoadLocalTimelineError::Load(source) => {
// We tried to load a deleted timeline; this is a bug.
return Err(anyhow::anyhow!(source).context(format!(
    "This is a bug. We tried to load a deleted timeline, which is wrong, and loading failed. Timeline: {timeline_id}"
)));
}
LoadLocalTimelineError::ResumeDeletion(source) => {
// Make sure a resumed deletion won't fail loading for the entire tenant.
error!("Failed to resume timeline deletion: {source:#}")
}
}
}
}
}
}
trace!("Done");
@@ -968,7 +1077,8 @@ impl Tenant {
local_metadata: TimelineMetadata,
init_order: Option<&InitializationOrder>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
found_delete_mark: bool,
) -> Result<(), LoadLocalTimelineError> {
span::debug_assert_current_span_has_tenant_id();
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
@@ -980,14 +1090,6 @@ impl Tenant {
)
});
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
Some(ancestor_timeline)
} else {
None
};
let (remote_startup_data, remote_client) = match remote_client {
Some(remote_client) => match remote_client.download_index_file().await {
Ok(index_part) => {
@@ -1007,45 +1109,29 @@ impl Tenant {
info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");
remote_client
.init_upload_queue_stopped_to_continue_deletion(&index_part)?;
.init_upload_queue_stopped_to_continue_deletion(&index_part)
.context("init queue stopped")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
let timeline = self
.create_timeline_struct(
timeline_id,
&local_metadata,
ancestor,
Some(remote_client),
init_order,
)
.context("create_timeline_struct")?;
let guard = DeletionGuard(
Arc::clone(&timeline.delete_lock)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
timeline.set_state(TimelineState::Stopping);
// We need to do this because when the console retries a delete request we shouldn't answer
// with 404, because 404 means successful deletion.
// FIXME consider TimelineState::Deleting.
let mut locked = self.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
Tenant::schedule_delete_timeline(
DeleteTimelineFlow::resume_deletion(
Arc::clone(self),
timeline_id,
timeline,
guard,
);
&local_metadata,
Some(remote_client),
init_order,
)
.await
.context("resume deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
return Ok(());
}
};
let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
let remote_metadata = index_part
.parse_metadata()
.context("parse_metadata")
.map_err(LoadLocalTimelineError::Load)?;
(
Some(RemoteStartupData {
index_part,
@@ -1055,12 +1141,54 @@ impl Tenant {
)
}
Err(DownloadError::NotFound) => {
info!("no index file was found on the remote");
info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
if found_delete_mark {
// We could've resumed at a point where the remote index was deleted, but the metadata file wasn't.
// Cleanup:
return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(
self,
timeline_id,
)
.await
.context("cleanup_remaining_timeline_fs_traces")
.map_err(LoadLocalTimelineError::ResumeDeletion);
}
// We're loading a fresh timeline that didn't yet make it to remote storage.
(None, Some(remote_client))
}
Err(e) => return Err(anyhow::anyhow!(e)),
Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
},
None => (None, remote_client),
None => {
// No remote client
if found_delete_mark {
// There is no remote client, we found local metadata.
// Continue cleaning up local disk.
DeleteTimelineFlow::resume_deletion(
Arc::clone(self),
timeline_id,
&local_metadata,
None,
init_order,
)
.await
.context("resume deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
return Ok(());
}
(None, remote_client)
}
};
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))
.map_err(LoadLocalTimelineError::Load)?;
Some(ancestor_timeline)
} else {
None
};
self.timeline_init_and_sync(
@@ -1074,6 +1202,7 @@ impl Tenant {
ctx,
)
.await
.map_err(LoadLocalTimelineError::Load)
}
pub fn tenant_id(&self) -> TenantId {
@@ -1335,7 +1464,11 @@ impl Tenant {
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn compaction_iteration(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(
self.is_active(),
"Cannot run compaction iteration on inactive tenant"
@@ -1363,7 +1496,7 @@ impl Tenant {
for (timeline_id, timeline) in &timelines_to_compact {
timeline
.compact(ctx)
.compact(cancel, ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await?;
}
@@ -1430,269 +1563,6 @@ impl Tenant {
}
}
/// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
/// data from both disk and s3.
async fn delete_timeline(
&self,
timeline_id: TimelineId,
timeline: Arc<Timeline>,
guard: DeletionGuard,
) -> anyhow::Result<()> {
{
// Grab the layer_removal_cs lock, and actually perform the deletion.
//
// This lock prevents GC or compaction from running at the same time.
// The GC task doesn't register itself with the timeline it's operating on,
// so it might still be running even though we called `shutdown_tasks`.
//
// Note that there are still other race conditions between
// GC, compaction and timeline deletion. See
// https://github.com/neondatabase/neon/issues/2671
//
// No timeout here, GC & Compaction should be responsive to the
// `TimelineState::Stopping` change.
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: remote_timeline_client upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = self
.conf
.timeline_path(&self.tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we tolerate that we're not crash-safe here.
// The timeline may come up Active but with missing layer files, in such setups.
// See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
match std::fs::remove_dir_all(&local_timeline_directory) {
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
let tenant_state = self.current_state();
warn!(
timeline_dir=?local_timeline_directory,
?tenant_state,
"timeline directory not found, proceeding anyway"
);
// continue with the rest of the deletion
}
res => res.with_context(|| {
format!(
"Failed to remove local timeline directory '{}'",
local_timeline_directory.display()
)
})?,
}
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
}
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
pausable_failpoint!("in_progress_delete");
{
// Remove the timeline from the map.
let mut timelines = self.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
if children_exist {
panic!("Timeline grew children while we removed layer files");
}
timelines.remove(&timeline_id).expect(
"timeline that we were deleting was concurrently removed from 'timelines' map",
);
drop(timelines);
}
drop(guard);
Ok(())
}
/// Removes timeline-related in-memory data and schedules removal from remote storage.
#[instrument(skip(self, _ctx))]
pub async fn prepare_and_schedule_delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
//
// Also grab the Timeline's delete_lock to prevent another deletion from starting.
let timeline;
let delete_lock_guard;
{
let mut timelines = self.timelines.lock().unwrap();
// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
let timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(e) => e,
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
};
timeline = Arc::clone(timeline_entry.get());
// Prevent two tasks from trying to delete the timeline at the same time.
delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_lock)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);
// If another task finished the deletion just before we acquired the lock,
// return success.
if delete_lock_guard.is_deleted() {
return Ok(());
}
timeline.set_state(TimelineState::Stopping);
drop(timelines);
}
// Now that the Timeline is in Stopping state, request all the related tasks to
// shut down.
//
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
// Mark timeline as deleted in S3 so we won't pick it up next time
// during attach or pageserver restart.
// See comment in persist_index_part_with_deleted_flag.
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard);
Ok(())
}
fn schedule_delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
timeline: Arc<Timeline>,
guard: DeletionGuard,
) {
let tenant_id = self.tenant_id;
let timeline_clone = Arc::clone(&timeline);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(self.tenant_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await {
error!("Error: {err:#}");
timeline_clone.set_broken(err.to_string())
};
Ok(())
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -2149,6 +2019,10 @@ impl Tenant {
/// The returned Timeline is in Loading state. The caller is responsible for
/// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
/// map.
///
/// `validate_ancestor == false` is used when a timeline is created for deletion
/// and we might not have the ancestor present anymore, which is fine for
/// to-be-deleted timelines.
fn create_timeline_struct(
&self,
new_timeline_id: TimelineId,
@@ -2156,19 +2030,26 @@ impl Tenant {
ancestor: Option<Arc<Timeline>>,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
cause: CreateTimelineCause,
) -> anyhow::Result<Arc<Timeline>> {
if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
anyhow::ensure!(
ancestor.is_some(),
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
)
}
let state = match cause {
CreateTimelineCause::Load => {
let ancestor_id = new_metadata.ancestor_timeline();
anyhow::ensure!(
ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
"Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
);
TimelineState::Loading
}
CreateTimelineCause::Delete => TimelineState::Stopping,
};
let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
let pg_version = new_metadata.pg_version();
Ok(Timeline::new(
let timeline = Timeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
@@ -2180,7 +2061,10 @@ impl Tenant {
pg_version,
initial_logical_size_can_start.cloned(),
initial_logical_size_attempt.cloned(),
))
state,
);
Ok(timeline)
}
fn new(
@@ -2194,28 +2078,53 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
let mut current_state: &'static str = From::from(&*rx.borrow_and_update());
let tid = tenant_id.to_string();
TENANT_STATE_METRIC
.with_label_values(&[&tid, current_state])
.inc();
loop {
match rx.changed().await {
Ok(()) => {
let new_state: &'static str = From::from(&*rx.borrow_and_update());
TENANT_STATE_METRIC
.with_label_values(&[&tid, current_state])
.dec();
TENANT_STATE_METRIC
.with_label_values(&[&tid, new_state])
.inc();
current_state = new_state;
}
Err(_sender_dropped_error) => {
info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
return;
}
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
}
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
let mut counted_broken = if !is_broken {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it has most likely already been scraped, as these are manual
// operations right now. we will most likely add it back very soon.
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.set(1);
true
};
loop {
let labels = &tuple.0;
let current = TENANT_STATE_METRIC.with_label_values(labels);
current.inc();
if rx.changed().await.is_err() {
// tenant has been dropped; decrement the counter because a tenant with that
// state is no longer in the tenant map, but allow any broken set item to
// remain.
current.dec();
break;
}
current.dec();
tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.inc();
}
}
});
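// A standalone sketch of the watch-channel observer pattern above, assuming
// nothing beyond `tokio`: read the current value with `borrow_and_update`,
// then loop on `changed()`; an Err from `changed()` means the sender side
// (here, the owner of the state) was dropped.
use tokio::sync::watch;

async fn observe_states() {
    let (tx, mut rx) = watch::channel("Loading");
    tokio::spawn(async move {
        tx.send("Active").ok();
        // `tx` is dropped here, which terminates the observer loop below
    });
    let mut current = *rx.borrow_and_update();
    while rx.changed().await.is_ok() {
        let new_state = *rx.borrow_and_update();
        println!("{current} -> {new_state}");
        current = new_state;
    }
}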
@@ -2822,7 +2731,14 @@ impl Tenant {
};
let timeline_struct = self
.create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None)
.create_timeline_struct(
new_timeline_id,
new_metadata,
ancestor,
remote_client,
None,
CreateTimelineCause::Load,
)
.context("Failed to create timeline data structure")?;
timeline_struct.init_empty_layer_map(start_lsn);
@@ -3210,7 +3126,7 @@ impl Drop for Tenant {
}
}
/// Dump contents of a layer file to stdout.
pub fn dump_layerfile_from_path(
pub async fn dump_layerfile_from_path(
path: &Path,
verbose: bool,
ctx: &RequestContext,
@@ -3224,27 +3140,22 @@ pub fn dump_layerfile_from_path(
file.read_exact_at(&mut header_buf, 0)?;
match u16::from_be_bytes(header_buf) {
crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose, ctx)?,
crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose, ctx)?,
crate::IMAGE_FILE_MAGIC => {
ImageLayer::new_for_path(path, file)?
.dump(verbose, ctx)
.await?
}
crate::DELTA_FILE_MAGIC => {
DeltaLayer::new_for_path(path, file)?
.dump(verbose, ctx)
.await?
}
magic => bail!("unrecognized magic identifier: {:?}", magic),
}
Ok(())
}
fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
F: Fn() -> io::Result<()>,
{
fs_operation().or_else(|e| {
if e.kind() == io::ErrorKind::NotFound {
Ok(())
} else {
Err(e)
}
})
}
#[cfg(test)]
pub mod harness {
use bytes::{Bytes, BytesMut};
@@ -3441,6 +3352,7 @@ mod tests {
use hex_literal::hex;
use once_cell::sync::Lazy;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
@@ -3922,9 +3834,9 @@ mod tests {
.await
.err()
.expect("should fail");
// get all the stack with all .context, not tonly the last one
// get all the stack with all .context, not only the last one
let message = format!("{err:#}");
let expected = "Failed to parse metadata bytes from path";
let expected = "failed to load metadata";
assert!(
message.contains(expected),
"message '{message}' expected to contain {expected}"
@@ -3941,7 +3853,8 @@ mod tests {
}
assert!(
found_error_message,
"didn't find the corrupted metadata error"
"didn't find the corrupted metadata error in {}",
message
);
Ok(())
@@ -3962,7 +3875,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -3972,7 +3885,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -3982,7 +3895,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -3992,7 +3905,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
@@ -4061,7 +3974,7 @@ mod tests {
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}
@@ -4138,7 +4051,7 @@ mod tests {
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}
@@ -4226,7 +4139,7 @@ mod tests {
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}

View File

@@ -16,29 +16,19 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
use std::cmp::min;
use std::io::{Error, ErrorKind};
/// For reading
pub trait BlobCursor {
impl<R> BlockCursor<R>
where
R: BlockReader,
{
/// Read a blob into a new buffer.
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
self.read_blob_into_buf(offset, &mut buf)?;
Ok(buf)
}
/// Read blob into the given buffer. Any previous contents in the buffer
/// are overwritten.
fn read_blob_into_buf(
&mut self,
offset: u64,
dstbuf: &mut Vec<u8>,
) -> Result<(), std::io::Error>;
}
impl<R> BlobCursor for BlockCursor<R>
where
R: BlockReader,
{
fn read_blob_into_buf(
pub fn read_blob_into_buf(
&mut self,
offset: u64,
dstbuf: &mut Vec<u8>,

View File

@@ -390,39 +390,42 @@ where
}
#[allow(dead_code)]
pub fn dump(&self) -> Result<()> {
self.dump_recurse(self.root_blk, &[], 0)
}
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
stack.push((self.root_blk, String::new(), 0, 0, 0));
let node = OnDiskNode::<L>::deparse(buf)?;
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
print!("{:indent$}", "", indent = depth * 2);
println!(
"blk #{}: path {}: prefix {}, suffix_len {}",
blknum,
hex::encode(path),
hex::encode(node.prefix),
node.suffix_len
);
if child_idx == 0 {
print!("{:indent$}", "", indent = depth * 2);
let path_prefix = stack
.iter()
.map(|(_blknum, path, ..)| path.as_str())
.collect::<String>();
println!(
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
hex::encode(node.prefix),
node.suffix_len
);
}
let mut idx = 0;
let mut key_off = 0;
while idx < node.num_children {
if child_idx + 1 < node.num_children {
let key_off = key_off + node.suffix_len as usize;
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
}
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
let val = node.value(idx as usize);
let val = node.value(child_idx as usize);
print!("{:indent$}", "", indent = depth * 2 + 2);
println!("{}: {}", hex::encode(key), hex::encode(val.0));
if node.level > 0 {
let child_path = [path, node.prefix].concat();
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
}
idx += 1;
key_off += node.suffix_len as usize;
}
Ok(())
}
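// A self-contained sketch (hypothetical `Node` type) of the rewrite above:
// recursive `async fn`s would need boxed futures, so the depth-first walk is
// expressed with an explicit stack instead of recursion.
struct Node {
    label: String,
    children: Vec<Node>,
}

async fn dump_iterative(root: &Node) {
    let mut stack = vec![(root, 0usize)];
    while let Some((node, depth)) = stack.pop() {
        println!("{:indent$}{}", "", node.label, indent = depth * 2);
        // push in reverse so children print in their original order
        for child in node.children.iter().rev() {
            stack.push((child, depth + 1));
        }
    }
}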
@@ -754,8 +757,8 @@ mod tests {
}
}
#[test]
fn basic() -> Result<()> {
#[tokio::test]
async fn basic() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
@@ -775,7 +778,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
reader.dump().await?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
@@ -835,8 +838,8 @@ mod tests {
Ok(())
}
#[test]
fn lots_of_keys() -> Result<()> {
#[tokio::test]
async fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
@@ -856,7 +859,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
reader.dump().await?;
use std::sync::Mutex;
@@ -994,8 +997,8 @@ mod tests {
///
/// This test contains a particular data set, see disk_btree_test_data.rs
///
#[test]
fn particular_data() -> Result<()> {
#[tokio::test]
async fn particular_data() -> Result<()> {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1022,7 +1025,7 @@ mod tests {
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump()?;
reader.dump().await?;
Ok(())
}

View File

@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::BlockCursor;
use rand::{seq::SliceRandom, thread_rng, RngCore};
use std::fs;

View File

@@ -626,17 +626,17 @@ impl LayerMap {
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!("Begin dump LayerMap");
println!("open_layer:");
if let Some(open_layer) = &self.open_layer {
open_layer.dump(verbose, ctx)?;
open_layer.dump(verbose, ctx).await?;
}
println!("frozen_layers:");
for frozen_layer in self.frozen_layers.iter() {
frozen_layer.dump(verbose, ctx)?;
frozen_layer.dump(verbose, ctx).await?;
}
println!("historic_layers:");

View File

@@ -9,10 +9,11 @@
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::io::{self, Write};
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::{
@@ -267,24 +268,24 @@ pub fn save_metadata(
Ok(())
}
#[derive(Error, Debug)]
pub enum LoadMetadataError {
#[error(transparent)]
Read(#[from] io::Error),
#[error(transparent)]
Decode(#[from] anyhow::Error),
}
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> anyhow::Result<TimelineMetadata> {
) -> Result<TimelineMetadata, LoadMetadataError> {
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
format!(
"Failed to read metadata bytes from path {}",
metadata_path.display()
)
})?;
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
format!(
"Failed to parse metadata bytes from path {}",
metadata_path.display()
)
})
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
}
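// A hedged usage sketch for the new error type: with distinct `Read` and
// `Decode` variants, callers can special-case a missing metadata file
// instead of string-matching an anyhow error chain.
fn describe(err: &LoadMetadataError) -> &'static str {
    match err {
        LoadMetadataError::Read(io) if io.kind() == std::io::ErrorKind::NotFound => {
            "metadata file is missing"
        }
        LoadMetadataError::Read(_) => "failed to read metadata file",
        LoadMetadataError::Decode(_) => "metadata file is corrupt",
    }
}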
#[cfg(test)]

View File

@@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
use super::timeline::delete::DeleteTimelineFlow;
/// The tenants known to the pageserver.
/// The enum variants are used to distinguish the different states that the pageserver can be in.
enum TenantsMap {
@@ -421,12 +423,10 @@ pub enum DeleteTimelineError {
pub async fn delete_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = get_tenant(tenant_id, true).await?;
tenant
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
.await?;
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
Ok(())
}
@@ -768,55 +768,6 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
let guard = TENANTS.read().await;
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Compaction,
Some(tenant_id),
Some(timeline_id),
&format!(
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
),
false,
async move {
let result = timeline
.compact(&ctx)
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
.await;
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send compaction result: {result:?}"),
}
Ok(())
},
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;

View File

@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previosuly scheduled layer file
/// won't be performed until all previously scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do. To wait for the upload
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// succesfully.
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
)
};
receiver.changed().await?;
receiver.changed().await.context("upload queue shut down")?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -855,11 +855,23 @@ impl RemoteTimelineClient {
self.storage_impl.delete_objects(&remaining).await?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-delete"
))?
});
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
self.storage_impl.delete(&index_file_path).await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-index-delete"
))?
});
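// A hedged test-side sketch: failpoints like the two above can be tripped by
// name with the `fail` crate's runtime configuration (the "return" action
// makes the closure fire), then cleared again.
#[test]
fn deletion_interrupted_before_index_delete() {
    fail::cfg("timeline-delete-before-index-delete", "return").unwrap();
    // ... run the deletion and assert that it fails at this point ...
    fail::remove("timeline-delete-before-index-delete");
}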
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
@@ -1105,7 +1117,7 @@ impl RemoteTimelineClient {
debug!("remote task {} completed successfully", task.op);
}
// The task has completed succesfully. Remove it from the in-progress list.
// The task has completed successfully. Remove it from the in-progress list.
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {

View File

@@ -223,6 +223,45 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v2_indexpart_is_parsed_with_deleted_at() {
let example = r#"{
"version":2,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this, but the value might turn into a double
// when processed with jq, for example.
file_size: 9007199254741001,
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn empty_layers_are_parsed() {
let empty_layers_json = r#"{

View File

@@ -338,7 +338,8 @@ impl LayerAccessStats {
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known in the context of which the layers
/// are used in (timeline).
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
@@ -368,7 +369,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
/// is available. If this returns ValueReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -377,7 +378,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
) -> Result<ValueReconstructResult>;
/// Dump summary of the contents of the layer to stdout
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
}
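// A minimal sketch of the `async_trait` mechanics relied on above: stable Rust
// (at this point) has no native `async fn` in traits, so the attribute macro
// rewrites both the trait and its impls to return boxed futures.
use async_trait::async_trait;

#[async_trait]
trait DumpExample: Send + Sync {
    async fn dump(&self, verbose: bool) -> anyhow::Result<()>;
}

struct StubLayer;

#[async_trait]
impl DumpExample for StubLayer {
    async fn dump(&self, verbose: bool) -> anyhow::Result<()> {
        if verbose {
            println!("stub layer contents");
        }
        Ok(())
    }
}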
/// Returned by [`PersistentLayer::iter`]

View File

@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{PageReadGuard, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -223,9 +223,10 @@ impl std::fmt::Debug for DeltaLayerInner {
}
}
#[async_trait::async_trait]
impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
self.desc.tenant_id,
@@ -255,7 +256,7 @@ impl Layer for DeltaLayer {
file,
);
tree_reader.dump()?;
tree_reader.dump().await?;
let mut cursor = file.block_cursor();
@@ -300,7 +301,7 @@ impl Layer for DeltaLayer {
Ok(())
}
fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,

View File

@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use hex;
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard};
use tracing::*;
use utils::{
@@ -117,7 +117,7 @@ pub struct ImageLayer {
access_stats: LayerAccessStats,
inner: RwLock<ImageLayerInner>,
inner: OnceCell<ImageLayerInner>,
}
impl std::fmt::Debug for ImageLayer {
@@ -134,30 +134,27 @@ impl std::fmt::Debug for ImageLayer {
}
pub struct ImageLayerInner {
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
}
impl std::fmt::Debug for ImageLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ImageLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
}
}
#[async_trait::async_trait]
impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.desc.tenant_id,
@@ -174,11 +171,11 @@ impl Layer for ImageLayer {
}
let inner = self.load(LayerAccessKind::Dump, ctx)?;
let file = inner.file.as_ref().unwrap();
let file = &inner.file;
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump()?;
tree_reader.dump().await?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
@@ -189,7 +186,7 @@ impl Layer for ImageLayer {
}
/// Look up given page in the file
fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -202,7 +199,7 @@ impl Layer for ImageLayer {
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
let file = inner.file.as_ref().unwrap();
let file = &inner.file;
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -321,52 +318,26 @@ impl ImageLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<RwLockReadGuard<ImageLayerInner>> {
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
if let Some(inner) = self.inner.get() {
return Ok(inner);
}
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let mut inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(&mut inner).with_context(|| {
format!("Failed to load image layer {}", self.path().display())
})?
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. There's no function to do
// that in the std library RwLock, so we have to release and re-lock
// in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
drop(inner);
self.inner
.get_or_try_init(|| self.load_inner())
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
}
}
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
fn load_inner(&self) -> Result<ImageLayerInner> {
let path = self.path();
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -394,10 +365,11 @@ impl ImageLayer {
}
}
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
inner.loaded = true;
Ok(())
Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
file,
})
}
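// A small sketch of the `once_cell` pattern adopted above: compared to the old
// RwLock dance, `get_or_try_init` runs the fallible loader at most once and
// afterwards hands out plain shared references.
use once_cell::sync::OnceCell;

struct LazyLoaded {
    inner: OnceCell<String>,
}

impl LazyLoaded {
    fn get(&self) -> anyhow::Result<&String> {
        self.inner.get_or_try_init(|| Ok("loaded once".to_string()))
    }
}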
/// Create an ImageLayer struct representing an existing file on disk
@@ -421,12 +393,7 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
access_stats,
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
inner: OnceCell::new(),
}
}
@@ -453,12 +420,7 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
file: None,
loaded: false,
index_start_blk: 0,
index_root_blk: 0,
}),
inner: OnceCell::new(),
})
}
@@ -619,12 +581,7 @@ impl ImageLayerWriterInner {
desc,
lsn: self.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
inner: OnceCell::new(),
};
// fsync the file

View File

@@ -7,7 +7,7 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -110,6 +110,7 @@ impl InMemoryLayer {
}
}
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
@@ -132,7 +133,7 @@ impl Layer for InMemoryLayer {
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().unwrap();
let end_str = inner
@@ -183,7 +184,7 @@ impl Layer for InMemoryLayer {
}
/// Look up given value in the layer.
fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,

View File

@@ -65,8 +65,9 @@ impl std::fmt::Debug for RemoteLayer {
}
}
#[async_trait::async_trait]
impl Layer for RemoteLayer {
fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
@@ -77,7 +78,7 @@ impl Layer for RemoteLayer {
}
/// debugging function to print out the contents of the layer
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.desc.tenant_id,

View File

@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
Duration::from_secs(10)
} else {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&ctx).await {
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {

View File

@@ -1,3 +1,4 @@
pub mod delete;
mod eviction_task;
pub mod layer_manager;
mod logical_size;
@@ -79,6 +80,7 @@ use crate::METADATA_FILE_NAME;
use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use self::layer_manager::LayerManager;
@@ -237,11 +239,10 @@ pub struct Timeline {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
/// This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
///
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -283,7 +284,7 @@ pub struct Timeline {
/// Prevent two tasks from deleting the timeline at the same time. If held, the
/// timeline is being deleted. If 'true', the timeline has already been deleted.
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
@@ -293,6 +294,10 @@ pub struct Timeline {
/// Completion shared between all timelines loaded during startup; used to delay heavier
/// background tasks until some logical sizes have been calculated.
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
/// Load or creation time information about the disk_consistent_lsn and when the loading
/// happened. Used for consumption metrics.
pub(crate) loaded_at: (Lsn, SystemTime),
}
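// A hedged sketch of why `layer_removal_cs` above is an `Arc<tokio::sync::Mutex>`:
// an Arc'd tokio mutex can produce an `OwnedMutexGuard`, which is 'static and
// can therefore move into a spawned task.
use std::sync::Arc;

async fn hold_lock_in_task(lock: Arc<tokio::sync::Mutex<()>>) {
    let guard = lock.lock_owned().await; // OwnedMutexGuard<()>: 'static
    tokio::spawn(async move {
        let _guard = guard; // lock held for the lifetime of the task
    });
}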
pub struct WalReceiverInfo {
@@ -334,7 +339,7 @@ pub struct GcInfo {
#[derive(thiserror::Error)]
pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
Other(#[from] anyhow::Error),
/// The operation would require downloading a layer that is missing locally.
NeedsDownload(TenantTimelineId, LayerFileName),
@@ -475,7 +480,7 @@ impl Timeline {
img: cached_page_img,
};
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
@@ -555,7 +560,7 @@ impl Timeline {
"wait_lsn cannot be called in WAL receiver"
);
let _timer = self.metrics.wait_lsn_time_histo.start_timer();
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
match self
.last_record_lsn
@@ -611,9 +616,46 @@ impl Timeline {
}
/// Outermost timeline compaction operation; downloads needed layers.
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn compact(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
const ROUNDS: usize = 2;
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should leave some workers
// unblocked for other work, hopefully easing any outside-visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
// this wait probably never needs any "long time spent" logging, because we already nag if
// the compaction task goes over its period (20s), which happens quite often in production.
let _permit = tokio::select! {
permit = CONCURRENT_COMPACTIONS.acquire() => {
permit
},
_ = cancel.cancelled() => {
return Ok(());
}
};
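// A standalone sketch of the gating idiom above (`tokio` + `tokio-util`
// assumed): a lazily initialized semaphore caps concurrency, and `select!`
// lets cancellation win while a caller is still waiting for a permit.
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;

static GATE: once_cell::sync::Lazy<Semaphore> =
    once_cell::sync::Lazy::new(|| Semaphore::new(4));

async fn gated_work(cancel: &CancellationToken) {
    let _permit = tokio::select! {
        permit = GATE.acquire() => permit.expect("semaphore is never closed"),
        _ = cancel.cancelled() => return,
    };
    // ... bounded work runs here while `_permit` is held ...
}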
let last_record_lsn = self.get_last_record_lsn();
// Last record Lsn could be zero in case the timeline was just created
@@ -671,11 +713,9 @@ impl Timeline {
let mut failed = 0;
let mut cancelled = pin!(task_mgr::shutdown_watcher());
loop {
tokio::select! {
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
_ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
res = downloads.next() => {
match res {
Some(Ok(())) => {},
@@ -890,7 +930,7 @@ impl Timeline {
new_state,
TimelineState::Stopping | TimelineState::Broken { .. }
) {
// drop the copmletion guard, if any; it might be holding off the completion
// drop the completion guard, if any; it might be holding off the completion
// forever needlessly
self.initial_logical_size_attempt
.lock()
@@ -1325,9 +1365,10 @@ impl Timeline {
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
state: TimelineState,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Loading);
let (state, _) = watch::channel(state);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1367,6 +1408,8 @@ impl Timeline {
last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
last_freeze_ts: RwLock::new(Instant::now()),
loaded_at: (disk_consistent_lsn, SystemTime::now()),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
@@ -1418,7 +1461,7 @@ impl Timeline {
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -1563,7 +1606,7 @@ impl Timeline {
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if imgfilename.lsn > disk_consistent_lsn {
warn!(
info!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
imgfilename, self.timeline_id, disk_consistent_lsn
);
@@ -1595,7 +1638,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
warn!(
info!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
deltafilename, self.timeline_id, disk_consistent_lsn
);
@@ -1737,7 +1780,7 @@ impl Timeline {
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
warn!(
info!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
@@ -1762,7 +1805,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
warn!(
info!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
@@ -1883,6 +1926,15 @@ impl Timeline {
}
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {
// Can happen when the timeline detail endpoint is used while deletion is ongoing (or the timeline is broken).
return;
}
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
.try_acquire_owned()
{
@@ -2252,8 +2304,9 @@ impl Timeline {
let mut timeline_owned;
let mut timeline = self;
let mut read_count =
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
let mut read_count = scopeguard::guard(0, |cnt| {
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
});
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
@@ -2387,12 +2440,15 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match open_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
result = match open_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
@@ -2414,12 +2470,15 @@ impl Timeline {
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match frozen_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
result = match frozen_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
@@ -2450,12 +2509,15 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
result = match layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};

View File

@@ -0,0 +1,576 @@
use std::{
ops::{Deref, DerefMut},
sync::Arc,
};
use anyhow::Context;
use pageserver_api::models::TimelineState;
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
use utils::{
crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};
use super::Timeline;
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timelines fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
Ok(())
}
/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
Ok(())
}
// We delete local files first, so if the pageserver restarts after local file deletion, remote deletion is not continued.
// This could be solved by inverting these steps. But even then, once index_part.json
// gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
// and "this timeline is deleted, we should continue with removal of local state". To avoid the ambiguity we use a mark file:
// after the index part is deleted, the presence of the mark file identifies a deletion intention,
// so we can just remove the mark file.
async fn create_delete_mark(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-delete-mark"
))?
});
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
Ok(())
}
/// Grab the layer_removal_cs lock, and actually perform the deletion.
///
/// This lock prevents GC or compaction from running at the same time.
/// The GC task doesn't register itself with the timeline it's operating on,
/// so it might still be running even though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
async fn delete_local_layer_files(
conf: &PageServerConf,
tenant_id: TenantId,
timeline: &Timeline,
) -> anyhow::Result<()> {
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we guarantee crash-safety by persisting a delete mark file.
//
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
//
// Note that metadata removal is skipped; it is not technically needed, but
// allows us to reuse the timeline loading code during resumed deletion.
// (We always expect the metadata to be in place when a timeline is being loaded.)
#[cfg(feature = "testing")]
let mut counter = 0;
// The timeline directory may not exist if we failed to delete the mark file and the request was retried.
if !local_timeline_directory.exists() {
return Ok(());
}
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
#[cfg(feature = "testing")]
{
counter += 1;
if counter == 2 {
fail::fail_point!("timeline-delete-during-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
});
}
}
let entry = entry?;
if entry.path() == metadata_path {
debug!("found metadata, skipping");
continue;
}
if entry.path() == local_timeline_directory {
// Keep the directory because the metadata file is still there
debug!("found timeline dir itself, skipping");
continue;
}
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(e) => {
if crate::is_walkdir_io_not_found(&e) {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(e);
}
};
let r = if metadata.is_dir() {
// There shouldn't be any directories inside the timeline dir in the current layout.
tokio::fs::remove_dir(entry.path()).await
} else {
tokio::fs::remove_file(entry.path()).await
};
if let Err(e) = r {
if e.kind() == std::io::ErrorKind::NotFound {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(anyhow::anyhow!(
"Failed to remove: {}. Error: {e}",
entry.path().display()
));
}
}
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
Ok(())
}
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removes the remaining traces of a timeline on disk:
// the metadata file, the timeline directory, and the delete mark.
// Note: io::ErrorKind::NotFound is ignored for the metadata file and the timeline dir.
// The delete mark should be present, because removing it is the last step of deletion
// (nothing can fail after its removal).
async fn cleanup_remaining_timeline_fs_traces(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
// Remove local metadata
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove metadata")?;
fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-rm-metadata"
))?
});
// Remove timeline dir
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("timeline dir")?;
fail::fail_point!("timeline-delete-after-rm-dir", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
});
// Remove delete mark
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await
.context("remove delete mark")
}
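// A hedged sketch of the `fs_ext::ignore_not_found` idiom used above: mapping
// ErrorKind::NotFound to Ok keeps each cleanup step idempotent, so a
// crashed-and-retried deletion can safely repeat it.
fn ignore_not_found(e: std::io::Error) -> std::io::Result<()> {
    if e.kind() == std::io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

async fn remove_idempotent(path: &std::path::Path) -> std::io::Result<()> {
    tokio::fs::remove_file(path).await.or_else(ignore_not_found)
}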
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
tenant: &Tenant,
timeline_id: TimelineId,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
// Remove the timeline from the map.
let mut timelines = tenant.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, the remove_dir_all above would be atomic so we wouldn't see this timeline after a restart.)
if children_exist {
panic!("Timeline grew children while we removed layer files");
}
timelines
.remove(&timeline_id)
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
drop(timelines);
Ok(())
}
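// A minimal sketch of the "witness parameter" idiom above: requiring a
// reference to the guard type turns "caller must hold the lock" into a
// compile-time obligation rather than a comment.
struct LockWitness;

fn requires_lock(_witness: &LockWitness) {
    // reachable only while some LockWitness (i.e. the lock guard) is alive
}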
/// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
/// and deletes its data from both disk and S3.
/// The sequence of steps:
/// 1. Set deleted_at in the remote index part.
/// 2. Create the local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata).
/// 4. Delete remote layers.
/// 5. Delete the index part.
/// 6. Delete the metadata file and the timeline directory.
/// 7. Delete the mark file.
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] is the main one, called by a management API handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly need to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted the remote
/// index but still have the local metadata, timeline directory, and delete mark.
/// Note that the only other place that touches the timeline delete mark is the logic that scans the timelines directory during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
#[default]
NotStarted,
InProgress,
Finished,
}
impl DeleteTimelineFlow {
// These steps run in the context of the management API request handler.
// Long-running steps continue to run in the background.
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
guard.mark_in_progress()?;
stop_tasks(&timeline).await?;
set_deleted_in_remote_index(&timeline).await?;
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
fail::fail_point!("timeline-delete-before-schedule", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-schedule"
))?
});
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
Ok(())
}
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
match self {
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
Self::InProgress { .. } => { /* We're in a retry */ }
Self::NotStarted => { /* Fresh start */ }
}
*self = Self::InProgress;
Ok(())
}
/// Shortcut to create Timeline in stopping state and spawn deletion task.
pub async fn resume_deletion(
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating the layer map. The timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
let timeline = tenant
.create_timeline_struct(
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
remote_client,
init_order,
// Important. We don't pass the ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
)
.context("create_timeline_struct")?;
let mut guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);
// We need to do this because when the console retries the delete request we shouldn't answer with 404,
// because 404 means successful deletion.
{
let mut locked = tenant.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
}
guard.mark_in_progress()?;
// Note that the delete mark can be missing on resume
// because we create the delete mark only after setting deleted_at in the index part.
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
Self::schedule_background(guard, tenant.conf, tenant, timeline);
Ok(())
}
pub async fn cleanup_remaining_timeline_fs_traces(
tenant: &Tenant,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
}
fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here, when timeline deletion is finished, the timeline won't be present in the timelines map at all,
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(DeleteTimelineError::NotFound),
};
// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
// Note that using try_lock here is important to avoid a deadlock.
// Here we take the lock on the timelines map and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock the timelines map
// to remove the timeline from it.
// Whenever two locks are taken in different orders, a deadlock can result.
let delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);
timeline.set_state(TimelineState::Stopping);
Ok((Arc::clone(timeline), delete_lock_guard))
}
fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
) {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
error!("Error: {err:#}");
timeline.set_broken(format!("{err:#}"))
};
Ok(())
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}
async fn background(
mut guard: DeletionGuard,
conf: &PageServerConf,
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
*guard.0 = Self::Finished;
Ok(())
}
}
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
impl Deref for DeletionGuard {
type Target = DeleteTimelineFlow;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for DeletionGuard {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
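Taken together, the entrypoints above cover the API-driven path and both restart paths. A minimal sketch of the API-driven path (the handler name and surrounding wiring are illustrative, not part of this code):

    // Hypothetical management-API wiring, for illustration only.
    async fn delete_timeline_handler(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
        // Runs steps 1-2 (set deleted_at remotely, create the local delete mark)
        // inline, then schedules steps 3-7 on the TimelineDeletionWorker task.
        DeleteTimelineFlow::run(&tenant, timeline_id).await
    }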

View File

@@ -2,13 +2,9 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
use anyhow::Context;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, id::TimelineId, lsn::Lsn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
import_datadir,
tenant::{ignore_absent_files, Tenant},
};
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use super::Timeline;
@@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> {
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
@@ -185,7 +181,7 @@ impl TimelineUninitMark {
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;

View File

@@ -1123,7 +1123,7 @@ mod tests {
}
#[tokio::test]
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
@@ -1189,8 +1189,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1252,8 +1252,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();

View File

@@ -149,12 +149,10 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// We do not have information about tenant_id/timeline_id of evicted file.
// It is possible to store path together with file or use filepath crate,
// but as far as close() is not expected to be fast, it is not so critical to gather
// precise per-tenant statistic here.
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
STORAGE_IO_TIME
.with_label_values(&["close", "-", "-"])
.with_label_values(&["close-by-replace"])
.observe_closure_duration(|| drop(old_file));
}
@@ -208,7 +206,7 @@ impl VirtualFile {
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = STORAGE_IO_TIME
.with_label_values(&["open", &tenant_id, &timeline_id])
.with_label_values(&["open"])
.observe_closure_duration(|| open_options.open(path))?;
// Strip all options other than read and write.
@@ -271,7 +269,7 @@ impl VirtualFile {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.with_label_values(&[op])
.observe_closure_duration(|| func(file)));
}
}
@@ -298,12 +296,12 @@ impl VirtualFile {
// Open the physical file
let file = STORAGE_IO_TIME
.with_label_values(&["open", &self.tenant_id, &self.timeline_id])
.with_label_values(&["open"])
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.with_label_values(&[op])
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile
@@ -333,13 +331,11 @@ impl Drop for VirtualFile {
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// Unlike files evicted by replacement algorithm, here
// we group close time by tenant_id/timeline_id.
// At allows to compare number/time of "normal" file closes
// with file eviction.
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME
.with_label_values(&["close", &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| slot_guard.file.take());
.with_label_values(&["close"])
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
}

View File

@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
* done -2 if an error occured (> 0) if it was successful; that value is
* done -2 if an error occurred (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occured
* error occurred
*/
result = PQputCopyData(conn->pg_conn, buf, size);

View File

@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)
/*
* Performs the logic for advancing the state machine of the specified safekeeper,
* given that a certain set of events has occured.
* given that a certain set of events has occurred.
*/
static void
AdvancePollState(Safekeeper *sk, uint32 events)

View File

@@ -23,7 +23,7 @@
* message header */
/*
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
/* this is a criterion for walproposer --sync mode exit */
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
/* Feedback recieved from pageserver includes standby_status_update fields */
/* Feedback received from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;

poetry.lock generated
View File

@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"
[[package]]
name = "certifi"
version = "2022.12.7"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
]
[[package]]

View File

@@ -53,6 +53,12 @@ pub enum BackendType<'a, T> {
Postgres(Cow<'a, console::provider::mock::Api>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
/// Test backend.
Test(&'a dyn TestBackend),
}
pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
}
impl std::fmt::Display for BackendType<'_, ()> {
@@ -62,6 +68,7 @@ impl std::fmt::Display for BackendType<'_, ()> {
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
Test(_) => fmt.debug_tuple("Test").finish(),
}
}
}
@@ -75,6 +82,7 @@ impl<T> BackendType<'_, T> {
Console(c, x) => Console(Cow::Borrowed(c), x),
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
Test(x) => Test(*x),
}
}
}
@@ -89,6 +97,7 @@ impl<'a, T> BackendType<'a, T> {
Console(c, x) => Console(c, f(x)),
Postgres(c, x) => Postgres(c, f(x)),
Link(c) => Link(c),
Test(x) => Test(x),
}
}
}
@@ -102,6 +111,7 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
Console(c, x) => x.map(|x| Console(c, x)),
Postgres(c, x) => x.map(|x| Postgres(c, x)),
Link(c) => Ok(Link(c)),
Test(x) => Ok(Test(x)),
}
}
}
@@ -147,6 +157,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(_, creds) => creds.project.clone(),
Postgres(_, creds) => creds.project.clone(),
Link(_) => Some("link".to_owned()),
Test(_) => Some("test".to_owned()),
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
@@ -188,6 +199,9 @@ impl BackendType<'_, ClientCredentials<'_>> {
.await?
.map(CachedNodeInfo::new_uncached)
}
Test(_) => {
unreachable!("this function should never be called in the test backend")
}
};
info!("user successfully authenticated");
@@ -206,6 +220,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
Test(x) => x.wake_compute().map(Some),
}
}
}

View File

@@ -1,8 +1,11 @@
use std::ops::ControlFlow;
use super::AuthSuccess;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::handle_try_wake,
sasl, scram,
stream::PqStream,
};
@@ -48,7 +51,16 @@ pub(super) async fn authenticate(
}
};
let mut node = api.wake_compute(extra, creds).await?;
info!("compute node's state has likely changed; requesting a wake-up");
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries)? {
ControlFlow::Continue(_) => num_retries += 1,
ControlFlow::Break(n) => break n,
}
info!(num_retries, "retrying wake compute");
};
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));

View File

@@ -48,6 +48,14 @@ impl ClientCredentials<'_> {
}
impl<'a> ClientCredentials<'a> {
#[cfg(test)]
pub fn new_noop() -> Self {
ClientCredentials {
user: "",
project: None,
}
}
pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,

View File

@@ -14,6 +14,7 @@ pub mod errors {
use crate::{
error::{io_error, UserFacingError},
http,
proxy::ShouldRetry,
};
use thiserror::Error;
@@ -72,6 +73,24 @@ pub mod errors {
}
}
impl ShouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
..
} => true,
// retry server errors
Self::Console { status, .. } if status.is_server_error() => true,
_ => false,
}
}
}
impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()

View File

@@ -1,7 +1,9 @@
use std::sync::Arc;
use anyhow::bail;
use futures::pin_mut;
use futures::StreamExt;
use hashbrown::HashMap;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
@@ -11,6 +13,8 @@ use serde_json::Map;
use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
use url::Url;
@@ -23,12 +27,21 @@ struct QueryData {
params: Vec<serde_json::Value>,
}
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Payload {
Single(QueryData),
Batch(Vec<QueryData>),
}
pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
@@ -162,7 +175,7 @@ pub async fn handle(
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<Value> {
) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
//
// Determine the destination and connection params
//
@@ -177,6 +190,23 @@ pub async fn handle(
// Allow connection pooling only if explicitly requested
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
// isolation level and read only
let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
let txn_isolation_level = match txn_isolation_level_raw {
Some(ref x) => Some(match x.as_bytes() {
b"Serializable" => IsolationLevel::Serializable,
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
b"ReadCommitted" => IsolationLevel::ReadCommitted,
b"RepeatableRead" => IsolationLevel::RepeatableRead,
_ => bail!("invalid isolation level"),
}),
None => None,
};
let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
None => MAX_REQUEST_SIZE + 1,
@@ -192,15 +222,70 @@ pub async fn handle(
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
//
// Now execute the query and return the result
//
let client = conn_pool.get(&conn_info, !allow_pool).await?;
let result = match payload {
Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
.await
.map(|x| (x, HashMap::default())),
Payload::Batch(queries) => {
let mut results = Vec::new();
let mut builder = client.build_transaction();
if let Some(isolation_level) = txn_isolation_level {
builder = builder.isolation_level(isolation_level);
}
if txn_read_only {
builder = builder.read_only(true);
}
let transaction = builder.start().await?;
for query in queries {
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
transaction.rollback().await?;
return Err(e);
}
}
}
transaction.commit().await?;
let mut headers = HashMap::default();
headers.insert(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
}
Ok((json!({ "results": results }), headers))
}
};
let row_stream = client.query_raw_txt(query, query_params).await?;
if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
result
}
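From the client side, the batch path is selected by posting a JSON array (the untagged `Payload::Batch` variant) together with the transaction headers parsed above. A rough sketch using reqwest (the endpoint host, the omitted auth/connection headers, and the reqwest `json` feature are assumptions; the header names and `/sql` path come from this code):

    async fn batch_example() -> anyhow::Result<()> {
        let resp = reqwest::Client::new()
            .post("https://<proxy-host>/sql") // auth/connection headers omitted
            .header("neon-batch-isolation-level", "Serializable")
            .header("neon-batch-read-only", "true")
            // A JSON array of { query, params } objects deserializes into
            // Payload::Batch and runs inside a single transaction.
            .json(&serde_json::json!([
                { "query": "SELECT 1", "params": [] },
                { "query": "SELECT $1::int + 1", "params": ["41"] }
            ]))
            .send()
            .await?;
        println!("{}", resp.text().await?);
        Ok(())
    }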
async fn query_to_json<T: GenericClient>(
client: &T,
data: QueryData,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<Value> {
let query_params = json_to_pg_text(data.params)?;
let row_stream = client
.query_raw_txt::<String, _>(data.query, query_params)
.await?;
// Manually drain the stream into a vector to leave row_stream hanging
// around to get a command tag. Also check that the response is not too
@@ -256,13 +341,6 @@ pub async fn handle(
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
// resulting JSON format is based on the format of node-postgres result
Ok(json!({
"command": command_tag_name,

View File

@@ -6,6 +6,7 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hashbrown::HashMap;
use hyper::{
server::{
accept,
@@ -181,13 +182,15 @@ async fn ws_handler(
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
tokio::spawn(async move {
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
{
error!("error in websocket connection: {e:?}");
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
}
});
@@ -203,7 +206,7 @@ async fn ws_handler(
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,
};
let json = match result {
let (json, headers) = match result {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
@@ -214,7 +217,10 @@ async fn ws_handler(
},
None => Value::Null,
};
json!({ "message": message, "code": code })
(
json!({ "message": message, "code": code }),
HashMap::default(),
)
}
};
json_response(status_code, json).map(|mut r| {
@@ -222,6 +228,9 @@ async fn ws_handler(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
for (k, v) in headers {
r.headers_mut().insert(k, v);
}
r
})
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {

View File

@@ -6,18 +6,15 @@ use crate::{
cancellation::{self, CancelMap},
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
console::{
self,
errors::{ApiError, WakeComputeError},
messages::MetricsAuxInfo,
},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
stream::{PqStream, Stream},
};
use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use hyper::StatusCode;
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use metrics::{
exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use std::{error::Error, io, ops::ControlFlow, sync::Arc};
@@ -31,25 +28,37 @@ use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
const NUM_RETRIES_CONNECT: u32 = 10;
pub const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_accepted_connections_total",
"Number of TCP client connections accepted."
"Number of TCP client connections accepted.",
&["protocol"],
)
.unwrap()
});
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_connections_total",
"Number of TCP client connections closed."
"Number of TCP client connections closed.",
&["protocol"],
)
.unwrap()
});
static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
.unwrap()
});
@@ -137,6 +146,13 @@ pub enum ClientMode {
/// Abstracts the logic of handling TCP vs WS clients
impl ClientMode {
fn protocol_label(&self) -> &'static str {
match self {
ClientMode::Tcp => "tcp",
ClientMode::Websockets { .. } => "ws",
}
}
fn allow_cleartext(&self) -> bool {
match self {
ClientMode::Tcp => false,
@@ -175,10 +191,17 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mode: ClientMode,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
"handling interactive connection from client"
);
// The `closed` counter will increase when this future is destroyed.
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&[mode.protocol_label()])
.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
}
let tls = config.tls_config.as_ref();
@@ -324,11 +347,6 @@ async fn connect_to_compute_once(
.await
}
enum ConnectionState<E> {
Cached(console::CachedNodeInfo),
Invalid(compute::ConnCfg, E),
}
#[async_trait]
pub trait ConnectMechanism {
type Connection;
@@ -380,88 +398,91 @@ where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();
mechanism.update_connect_config(&mut node_info.config);
let mut num_retries = 0;
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
loop {
match state {
ConnectionState::Invalid(config, err) => {
match try_wake(&config, extra, creds).await {
// we can't wake up the compute node
Ok(None) => return Err(err.into()),
// there was an error communicating with the control plane
Err(e) => return Err(e.into()),
// failed to wake up but we can continue to retry
Ok(Some(ControlFlow::Continue(()))) => {
state = ConnectionState::Invalid(config, err);
let wait_duration = retry_after(num_retries);
num_retries += 1;
let mut num_retries = 1;
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
continue;
}
// successfully woke up a compute node and can break the wakeup loop
Ok(Some(ControlFlow::Break(mut node_info))) => {
mechanism.update_connect_config(&mut node_info.config);
state = ConnectionState::Cached(node_info)
}
}
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res, num_retries)? {
// failed to wake up but we can continue to retry
ControlFlow::Continue(_) => {}
// successfully woke up a compute node and can break the wakeup loop
ControlFlow::Break(mut node_info) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
}
// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
state = ConnectionState::Invalid(invalidate_cache(node_info), e);
} else {
state = ConnectionState::Cached(node_info);
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying wake compute");
};
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
}
// now that we have a new node, try connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying connect_once");
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
/// * Returns Ok(Some(false)) if the wakeup succeeded
/// * Returns Ok(None) or Err(e) if there was an error
async fn try_wake(
config: &compute::ConnCfg,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
) -> Result<Option<ControlFlow<console::CachedNodeInfo>>, WakeComputeError> {
info!("compute node's state has likely changed; requesting a wake-up");
match creds.wake_compute(extra).await {
// retry wake if the compute was in an invalid state
Err(WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
})) => Ok(Some(ControlFlow::Continue(()))),
// Update `node_info` and try again.
Ok(Some(mut new)) => {
new.config.reuse_password(config);
Ok(Some(ControlFlow::Break(new)))
}
Err(e) => Err(e),
Ok(None) => Ok(None),
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}
@@ -469,8 +490,6 @@ pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
@@ -522,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError {
}
}
pub fn retry_after(num_retries: u32) -> time::Duration {
match num_retries {
0 => time::Duration::ZERO,
_ => {
// 3/2 = 1.5 which seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
}
}
fn retry_after(num_retries: u32) -> time::Duration {
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}
/// Finish client connection initialization: confirm auth success, send params, etc.
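As a sanity check on the new backoff: with a 100 ms base and a 1.5 growth factor, retries 1 through 9 wait roughly 0.15 s, 0.23 s, ..., 3.8 s, about 11.2 s in total, which lands inside the 10-12 s window the `connect_compute_total_wait` test below asserts. A self-contained sketch of that arithmetic:

    use std::time::Duration;

    const BASE_RETRY_WAIT_DURATION: Duration = Duration::from_millis(100);

    fn retry_after(num_retries: u32) -> Duration {
        // 1.5^n via f64: grows smoothly and, unlike the old 3^n / 2^n integer form,
        // cannot overflow to a zero divisor for large n.
        BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
    }

    fn main() {
        let total: Duration = (1..10).map(retry_after).sum();
        println!("{total:?}"); // ~11.2s
    }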

View File

@@ -1,6 +1,10 @@
//! A group of high-level tests for connection establishing logic and auth.
//!
use super::*;
use crate::{auth, sasl, scram};
use crate::auth::backend::TestBackend;
use crate::auth::ClientCredentials;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::{auth, http, sasl, scram};
use async_trait::async_trait;
use rstest::rstest;
use tokio_postgres::config::SslMode;
@@ -298,9 +302,230 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 0..10 {
for num_retries in 1..10 {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));
assert!(total_wait > tokio::time::Duration::from_secs(10));
}
#[derive(Clone, Copy, Debug)]
enum ConnectAction {
Wake,
WakeFail,
WakeRetry,
Connect,
Retry,
Fail,
}
struct TestConnectMechanism {
counter: Arc<std::sync::Mutex<usize>>,
sequence: Vec<ConnectAction>,
}
impl TestConnectMechanism {
fn verify(&self) {
let counter = self.counter.lock().unwrap();
assert_eq!(
*counter,
self.sequence.len(),
"sequence does not proceed to the end"
);
}
}
impl TestConnectMechanism {
fn new(sequence: Vec<ConnectAction>) -> Self {
Self {
counter: Arc::new(std::sync::Mutex::new(0)),
sequence,
}
}
}
#[derive(Debug)]
struct TestConnection;
#[derive(Debug)]
struct TestConnectError {
retryable: bool,
}
impl std::fmt::Display for TestConnectError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
impl std::error::Error for TestConnectError {}
impl ShouldRetry for TestConnectError {
fn could_retry(&self) -> bool {
self.retryable
}
}
#[async_trait]
impl ConnectMechanism for TestConnectMechanism {
type Connection = TestConnection;
type ConnectError = TestConnectError;
type Error = anyhow::Error;
async fn connect_once(
&self,
_node_info: &console::CachedNodeInfo,
_timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
*counter += 1;
match action {
ConnectAction::Connect => Ok(TestConnection),
ConnectAction::Retry => Err(TestConnectError { retryable: true }),
ConnectAction::Fail => Err(TestConnectError { retryable: false }),
x => panic!("expecting action {:?}, connect is called instead", x),
}
}
fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}
impl TestBackend for TestConnectMechanism {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
*counter += 1;
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info()),
ConnectAction::WakeFail => {
let err = console::errors::ApiError::Console {
status: http::StatusCode::FORBIDDEN,
text: "TEST".into(),
};
assert!(!err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
ConnectAction::WakeRetry => {
let err = console::errors::ApiError::Console {
status: http::StatusCode::INTERNAL_SERVER_ERROR,
text: "TEST".into(),
};
assert!(err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
x => panic!("expecting action {:?}, wake_compute is called instead", x),
}
}
}
fn helper_create_cached_node_info() -> CachedNodeInfo {
let node = NodeInfo {
config: compute::ConnCfg::new(),
aux: Default::default(),
allow_self_signed_compute: false,
};
CachedNodeInfo::new_uncached(node)
}
fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> (
CachedNodeInfo,
console::ConsoleReqExtra<'static>,
auth::BackendType<'_, ClientCredentials<'static>>,
) {
let cache = helper_create_cached_node_info();
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some("TEST"),
};
let creds = auth::BackendType::Test(mechanism);
(cache, extra, creds)
}
#[tokio::test]
async fn connect_to_compute_success() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
#[tokio::test]
async fn connect_to_compute_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Test that we don't retry if the error is not retryable.
#[tokio::test]
async fn connect_to_compute_non_retry_1() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}
/// Even for non-retryable errors, we should retry at least once.
#[tokio::test]
async fn connect_to_compute_non_retry_2() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Retry for at most `NUM_RETRIES_CONNECT` times.
#[tokio::test]
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 10);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}
/// Should retry wake compute.
#[tokio::test]
async fn wake_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Wake failed with a non-retryable error.
#[tokio::test]
async fn wake_non_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}

View File

@@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
listen_pg_addr_tenant_only
);
let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
error!(
"failed to bind to address {}: {}",
listen_pg_addr_tenant_only, e
);
e
})?;
Some(listener)

View File

@@ -11,6 +11,7 @@ use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
use crate::{GlobalTimelines, SafeKeeperConf};
use postgres_backend::QueryError;
@@ -45,6 +46,7 @@ enum SafekeeperPostgresCommand {
StartWalPush,
StartReplication { start_lsn: Lsn },
IdentifySystem,
TimelineStatus,
JSONCtrl { cmd: AppendLogicalMessage },
}
@@ -64,6 +66,8 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
} else if cmd.starts_with("IDENTIFY_SYSTEM") {
Ok(SafekeeperPostgresCommand::IdentifySystem)
} else if cmd.starts_with("TIMELINE_STATUS") {
Ok(SafekeeperPostgresCommand::TimelineStatus)
} else if cmd.starts_with("JSON_CTRL") {
let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?;
Ok(SafekeeperPostgresCommand::JSONCtrl {
@@ -78,6 +82,7 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
match cmd {
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL",
}
@@ -219,6 +224,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
.await
}
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
handle_json_ctrl(self, pgb, cmd).await
}
@@ -263,6 +269,38 @@ impl SafekeeperPostgresHandler {
check_permission(claims, tenant_id)
}
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
// Get timeline, handling "not found" error
let tli = match GlobalTimelines::get(self.ttid) {
Ok(tli) => Ok(Some(tli)),
Err(TimelineError::NotFound(_)) => Ok(None),
Err(e) => Err(QueryError::Other(e.into())),
}?;
// Write row description
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"flush_lsn"),
RowDescriptor::text_col(b"commit_lsn"),
]))?;
// Write row if timeline exists
if let Some(tli) = tli {
let (inmem, _state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let commit_lsn = inmem.commit_lsn;
pgb.write_message_noflush(&BeMessage::DataRow(&[
Some(flush_lsn.to_string().as_bytes()),
Some(commit_lsn.to_string().as_bytes()),
]))?;
}
pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?;
Ok(())
}
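A sketch of issuing the new command from a client, assuming `client` is a `tokio_postgres::Client` already connected to the safekeeper with the tenant and timeline identified in the startup parameters (connection setup omitted):

    use tokio_postgres::SimpleQueryMessage;

    // TIMELINE_STATUS returns at most one row with text columns flush_lsn and commit_lsn;
    // no row means the timeline is not known to this safekeeper.
    for msg in client.simple_query("TIMELINE_STATUS").await? {
        if let SimpleQueryMessage::Row(row) = msg {
            println!("flush_lsn={:?} commit_lsn={:?}", row.get("flush_lsn"), row.get("commit_lsn"));
        }
    }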
///
/// Handle IDENTIFY_SYSTEM replication command
///

View File

@@ -0,0 +1,80 @@
#! /usr/bin/env python3
# Script to generate ext_index.json metadata file
# that stores content of the control files and location of extension archives
# for all extensions in extensions subdir.
import argparse
import json
import subprocess
from pathlib import Path
"""
# ext_index.json example:
{
"public_extensions": [
"anon"
],
"library_index": {
"anon": "anon",
"kq_imcx": "kq_imcx"
// would be more complicated for something like postgis where multiple library names all map to postgis
},
"extension_data": {
"kq_imcx": {
"control_data": {
"kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
},
"archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
},
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
}
}
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate ext_index.json")
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
args = parser.parse_args()
pg_version = args.pg_version
BUILD_TAG = args.BUILD_TAG
public_ext_list = args.public_extensions.split(",")
ext_index = {}
library_index = {}
EXT_PATH = Path("extensions")
for extension in EXT_PATH.iterdir():
if extension.is_dir():
control_data = {}
for control_file in extension.glob("*.control"):
if control_file.suffix != ".control":
continue
with open(control_file, "r") as f:
control_data[control_file.name] = f.read()
ext_index[extension.name] = {
"control_data": control_data,
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
}
elif extension.suffix == ".zst":
file_list = (
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
.strip()
.split("\n")
)
for file in file_list:
if file.endswith(".so") and file.startswith("lib/"):
lib_name = file[4:-3]
library_index[lib_name] = extension.name.replace(".tar.zst", "")
all_data = {
"public_extensions": public_ext_list,
"library_index": library_index,
"extension_data": ext_index,
}
with open("ext_index.json", "w") as f:
json.dump(all_data, f)

View File

@@ -40,10 +40,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
return metrics
def histogram(prefix_without_trailing_underscore: str) -> List[str]:
assert not prefix_without_trailing_underscore.endswith("_")
return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_timeline_client_calls_unfinished",
*[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
*[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
"pageserver_remote_physical_size",
"pageserver_remote_timeline_client_bytes_started_total",
"pageserver_remote_timeline_client_bytes_finished_total",
@@ -67,34 +70,29 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_read_num_fs_layers"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),
*histogram("pageserver_remote_timeline_client_calls_started"),
*histogram("pageserver_io_operations_seconds"),
"pageserver_tenant_states_count",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_getpage_get_reconstruct_data_seconds_bucket",
"pageserver_getpage_get_reconstruct_data_seconds_count",
"pageserver_getpage_get_reconstruct_data_seconds_sum",
"pageserver_io_operations_bytes_total",
"pageserver_io_operations_seconds_bucket",
"pageserver_io_operations_seconds_count",
"pageserver_io_operations_seconds_sum",
"pageserver_last_record_lsn",
"pageserver_read_num_fs_layers_bucket",
"pageserver_read_num_fs_layers_count",
"pageserver_read_num_fs_layers_sum",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_wait_lsn_seconds_bucket",
"pageserver_wait_lsn_seconds_count",
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
"pageserver_tenant_states_count",
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
)

View File

@@ -542,7 +542,7 @@ class S3Storage:
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = None
prefix_in_bucket: Optional[str] = ""
def access_env_vars(self) -> Dict[str, str]:
return {
@@ -1504,6 +1504,7 @@ class NeonCli(AbstractNeonCli):
safekeepers: Optional[List[int]] = None,
tenant_id: Optional[TenantId] = None,
lsn: Optional[Lsn] = None,
branch_name: Optional[str] = None,
) -> "subprocess.CompletedProcess[str]":
args = [
"endpoint",
@@ -1517,8 +1518,11 @@ class NeonCli(AbstractNeonCli):
args.append(f"--lsn={lsn}")
args.extend(["--pg-port", str(pg_port)])
args.extend(["--http-port", str(http_port)])
if safekeepers is not None:
args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
if branch_name is not None:
args.extend(["--branch-name", branch_name])
if endpoint_id is not None:
args.append(endpoint_id)

View File

@@ -194,14 +194,18 @@ def wait_for_upload_queue_empty(
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False,
):
last_exc = None
for _ in range(2):
iterations = 10 if wait_longer else 2
for _ in range(iterations):
time.sleep(0.250)
try:
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
log.error(f"detail {data}")
log.info(f"detail {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
@@ -216,7 +220,8 @@ def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3
**delete_args,
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)

View File

@@ -61,6 +61,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
durations = {
"wait_for_spec_ms": f"{i}_wait_for_spec",
"sync_safekeepers_ms": f"{i}_sync_safekeepers",
"sync_sk_check_ms": f"{i}_sync_sk_check",
"basebackup_ms": f"{i}_basebackup",
"start_postgres_ms": f"{i}_start_postgres",
"config_ms": f"{i}_config",

View File

@@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
@pytest.mark.parametrize("content_type", [None, "application/json"])
def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
"""
For backwards-compatiblity: if we send an empty body,
For backwards-compatibility: if we send an empty body,
the request should be accepted and the config should be the default config.
"""
env = positive_env

View File

@@ -4,7 +4,7 @@ import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Optional
from typing import Any, List, Optional
import pytest
import toml # TODO: replace with tomllib for Python >= 3.11
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
PortDistributor,
parse_project_git_version_output,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
@@ -63,7 +62,6 @@ def test_create_snapshot(
neon_env_builder.pg_version = pg_version
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_local_fs_remote_storage()
neon_env_builder.preserve_database_files = True
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
@@ -259,36 +257,15 @@ def prepare_snapshot(
shutil.rmtree(repo_dir / "pgdatadirs")
os.mkdir(repo_dir / "endpoints")
# Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
# them anymore, but old versions did.
for tenant in (repo_dir / "tenants").glob("*"):
wal_redo_dir = tenant / "wal-redo-datadir.___temp"
if wal_redo_dir.exists() and wal_redo_dir.is_dir():
shutil.rmtree(wal_redo_dir)
# Update paths and ports in config files
pageserver_toml = repo_dir / "pageserver.toml"
pageserver_config = toml.load(pageserver_toml)
pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_http_addr"]
)
pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_pg_addr"]
)
# since storage_broker these are overriden by neon_local during pageserver
# start; remove both to prevent unknown options during etcd ->
# storage_broker migration. TODO: remove once broker is released
pageserver_config.pop("broker_endpoint", None)
pageserver_config.pop("broker_endpoints", None)
etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version
for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])
# Older pageserver versions had just one `auth_type` setting. Now there
# are separate settings for pg and http ports. We don't use authentication
# in compatibility tests so just remove authentication related settings.
pageserver_config.pop("auth_type", None)
# We don't use authentication in compatibility tests
# so just remove authentication related settings.
pageserver_config.pop("pg_auth_type", None)
pageserver_config.pop("http_auth_type", None)
@@ -300,31 +277,16 @@ def prepare_snapshot(
snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml)
# Provide up/downgrade etcd <-> storage_broker to make forward/backward
# compatibility test happy. TODO: leave only the new part once broker is released.
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
# old etcd version
snapshot_config["etcd_broker"] = {
"etcd_binary_path": shutil.which("etcd"),
"broker_endpoints": etcd_broker_endpoints,
}
snapshot_config.pop("broker", None)
else:
# new storage_broker version
broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
snapshot_config.pop("etcd_broker", None)
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_http_addr"]
)
snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_pg_addr"]
for param in ("listen_http_addr", "listen_pg_addr"):
snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"][param]
)
snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
snapshot_config["broker"]["listen_addr"]
)
for sk in snapshot_config["safekeepers"]:
sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
for param in ("http_port", "pg_port", "pg_tenant_only_port"):
sk[param] = port_distributor.replace_with_new_port(sk[param])
if pg_distrib_dir:
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
@@ -350,12 +312,6 @@ def prepare_snapshot(
), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
# get git SHA of neon binary
def get_neon_version(neon_binpath: Path):
out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
return parse_project_git_version_output(out)
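# Illustrative usage (mirrors the version gates above, not new behavior): pin
# handling to the last etcd-based release by comparing against its git SHA:
#
#     if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
#         ...  # keep the legacy etcd settings for the old binary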
def check_neon_works(
repo_dir: Path,
neon_target_binpath: Path,
@@ -381,7 +337,6 @@ def check_neon_works(
config.pg_version = pg_version
config.initial_tenant = snapshot_config["default_tenant_id"]
config.pg_distrib_dir = pg_distrib_dir
config.preserve_database_files = True
# Use the "target" binaries to launch the storage nodes
config_target = config
@@ -438,6 +393,14 @@ def check_neon_works(
test_output_dir / "dump-from-wal.filediff",
)
# TODO: Run pg_amcheck unconditionally after the next release
try:
pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
except subprocess.CalledProcessError:
log.info("Extension amcheck is not available, skipping pg_amcheck")
else:
pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
# Check that we can interact with the data
pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
@@ -445,10 +408,15 @@ def check_neon_works(
assert not initial_dump_differs, "initial dump differs"
def dump_differs(first: Path, second: Path, output: Path) -> bool:
def dump_differs(
first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None
) -> bool:
"""
Runs the diff(1) command on two SQL dumps and writes the output to the given output file.
Returns True if the dumps differ, False otherwise.
The function supports allowed diffs: if the diff is in the allowed_diffs list, it is not considered a difference.
See an example of this in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6
Returns True if the dumps differ and the produced diff is not allowed, False otherwise (in most cases we want it to return False).
"""
with output.open("w") as stdout:
@@ -466,51 +434,30 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:
differs = res.returncode != 0
# TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
if differs:
with tempfile.NamedTemporaryFile(mode="w") as tmp:
tmp.write(PR4425_ALLOWED_DIFF)
tmp.flush()
allowed_diffs = allowed_diffs or []
if differs and len(allowed_diffs) > 0:
for allowed_diff in allowed_diffs:
with tempfile.NamedTemporaryFile(mode="w") as tmp:
tmp.write(allowed_diff)
tmp.flush()
allowed = subprocess.run(
[
"diff",
"--unified", # Make diff output more readable
r"--ignore-matching-lines=^---", # Ignore diff headers
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
"--ignore-matching-lines=^@@", # Ignore diff blocks location
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
"--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14
"--ignore-blank-lines",
str(output),
str(tmp.name),
],
)
allowed = subprocess.run(
[
"diff",
"--unified", # Make diff output more readable
r"--ignore-matching-lines=^---", # Ignore diff headers
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
"--ignore-matching-lines=^@@", # Ignore diff blocks location
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
"--ignore-matching-lines=^ --.*", # Ignore SQL comments in diff
"--ignore-blank-lines",
str(output),
str(tmp.name),
],
)
differs = allowed.returncode != 0
differs = allowed.returncode != 0
if not differs:
break
return differs
PR4425_ALLOWED_DIFF = """
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000
@@ -13,12 +13,20 @@
CREATE ROLE cloud_admin;
ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
+CREATE ROLE neon_superuser;
+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
--
-- User Configurations
--
+--
+-- Role memberships
+--
+
+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
"""

View File

@@ -136,8 +136,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},

View File

@@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test

View File

@@ -140,8 +140,6 @@ def test_metric_collection(
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},

View File

@@ -1,92 +0,0 @@
import random
import threading
from threading import Thread
from typing import List
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
from fixtures.utils import query_scalar
#
# Test multixact state after branching
# For now this test is very minimalistic:
# it only checks the next_multixact_id field in the restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact_conc(neon_simple_env: NeonEnv, test_output_dir):
env = neon_simple_env
env.neon_cli.create_branch("test_multixact", "empty")
endpoint = env.endpoints.create_start("test_multixact")
log.info("postgres is running on 'test_multixact' branch")
n_records = 100
n_threads = 5
n_iters = 1000
n_restarts = 10
cur = endpoint.connect().cursor()
cur.execute(
f"""
CREATE TABLE t1(pk int primary key, val integer);
INSERT INTO t1 values (generate_series(1, {n_records}), 0);
"""
)
next_multixact_id_old = query_scalar(
cur, "SELECT next_multixact_id FROM pg_control_checkpoint()"
)
# Lock entries using parallel connections in a round-robin fashion.
def do_updates():
conn = endpoint.connect(autocommit=False)
for i in range(n_iters):
pk = random.randrange(1, n_records)
conn.cursor().execute(f"update t1 set val=val+1 where pk={pk}")
conn.cursor().execute("select * from t1 for key share")
conn.commit()
conn.close()
for iter in range(n_restarts):
threads: List[threading.Thread] = []
for i in range(n_threads):
threads.append(threading.Thread(target=do_updates, args=(), daemon=False))
threads[-1].start()
for thread in threads:
thread.join()
# Restart endpoint
endpoint.stop()
endpoint.start()
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("select count(*) from t1")
assert cur.fetchone() == (n_records,)
# force wal flush
cur.execute("checkpoint")
cur.execute(
"SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()"
)
res = cur.fetchone()
assert res is not None
next_multixact_id = res[0]
lsn = res[1]
# Ensure that we did lock some tuples
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
endpoint_new = env.endpoints.create_start("test_multixact_new")
log.info("postgres is running on 'test_multixact_new' branch")
next_multixact_id_new = endpoint_new.safe_psql(
"SELECT next_multixact_id FROM pg_control_checkpoint()"
)[0][0]
# Check that we restored pg_controlfile correctly
assert next_multixact_id_new == next_multixact_id

View File

@@ -16,11 +16,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port
)
env.neon_cli.create_branch(new_branch_name="migration_check")
branch_name = "migration-check"
env.neon_cli.create_branch(new_branch_name=branch_name)
pg_port = port_distributor.get_port()
http_port = port_distributor.get_port()
env.neon_cli.endpoint_start(
endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port
f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name
)
finally:
env.neon_cli.stop()

View File

@@ -27,15 +27,16 @@ from fixtures.types import Lsn
from fixtures.utils import query_scalar, wait_until
def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):
def get_num_downloaded_layers(client: PageserverHttpClient):
"""
This assumes that the pageserver only has a single tenant.
"""
value = client.get_metric_value(
"pageserver_remote_operation_seconds_count",
{
"file_kind": "layer",
"op_kind": "download",
"status": "success",
"tenant_id": tenant_id,
"timeline_id": timeline_id,
},
)
if value is None:
@@ -57,7 +58,8 @@ def test_ondemand_download_large_rel(
test_name="test_ondemand_download_large_rel",
)
##### First start, insert secret data and upload it to the remote storage
# Thinking about using a shared environment? The test assumes that global
# metrics are for a single tenant.
env = neon_env_builder.init_start(
initial_tenant_conf={
# disable background GC
@@ -129,7 +131,7 @@ def test_ondemand_download_large_rel(
# safekeepers, that have now been shut down.
endpoint = env.endpoints.create_start("main", lsn=current_lsn)
before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
before_downloads = get_num_downloaded_layers(client)
assert before_downloads != 0, "basebackup should trigger on-demand download of a non-zero number of layers"
# Probe in the middle of the table. There's a high chance that the beginning
@@ -140,7 +142,7 @@ def test_ondemand_download_large_rel(
with endpoint.cursor() as cur:
assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
after_downloads = get_num_downloaded_layers(client)
log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
assert after_downloads > before_downloads
@@ -159,13 +161,11 @@ def test_ondemand_download_timetravel(
test_name="test_ondemand_download_timetravel",
)
##### First start, insert data and upload it to the remote storage
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# Thinking about using a shared environment? The test assumes that global
# metrics are for a single tenant.
# Override defaults, to create more layers
tenant, _ = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
# Disable background GC & compaction
# We don't want GC, that would break the assertion about num downloads.
# We don't want background compaction, we force a compaction every time we do explicit checkpoint.
@@ -178,7 +178,7 @@ def test_ondemand_download_timetravel(
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
}
)
env.initial_tenant = tenant
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
@@ -283,7 +283,7 @@ def test_ondemand_download_timetravel(
== table_len
)
after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
after_downloads = get_num_downloaded_layers(client)
num_layers_downloaded.append(after_downloads)
log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")
@@ -324,11 +324,8 @@ def test_download_remote_layers_api(
)
##### First start, insert data and upload it to the remote storage
env = neon_env_builder.init_start()
# Override defaults, to create more layers
tenant, _ = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
# Disable background GC & compaction
# We don't want GC, that would break the assertion about num downloads.
# We don't want background compaction, we force a compaction every time we do explicit checkpoint.
@@ -341,7 +338,6 @@ def test_download_remote_layers_api(
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
}
)
env.initial_tenant = tenant
endpoint = env.endpoints.create_start("main")
@@ -489,8 +485,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
test_name="test_compaction_downloads_on_demand_without_image_creation",
)
env = neon_env_builder.init_start()
conf = {
# Disable background GC & compaction
"gc_period": "0s",
@@ -506,6 +500,8 @@ def test_compaction_downloads_on_demand_without_image_creation(
# pitr_interval and gc_horizon are not interesting because we dont run gc
}
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]:
m = pageserver_http.get_metrics()
# these are global counters
@@ -517,11 +513,12 @@ def test_compaction_downloads_on_demand_without_image_creation(
assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64"
return (int(total_bytes), int(count))
# Override defaults, to create more layers
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
env.initial_tenant = tenant_id
pageserver_http = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
with env.endpoints.create_start("main") as endpoint:
# no particular reason to create the layers like this, but we are sure
# not to hit the image_creation_threshold here.
@@ -577,8 +574,6 @@ def test_compaction_downloads_on_demand_with_image_creation(
test_name="test_compaction_downloads_on_demand",
)
env = neon_env_builder.init_start()
conf = {
# Disable background GC & compaction
"gc_period": "0s",
@@ -593,9 +588,11 @@ def test_compaction_downloads_on_demand_with_image_creation(
# pitr_interval and gc_horizon are not interesting because we dont run gc
}
# Override defaults, to create more layers
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
env.initial_tenant = tenant_id
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
@@ -664,10 +661,6 @@ def test_compaction_downloads_on_demand_with_image_creation(
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ondemand_download_failure_to_replace(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
@@ -691,15 +684,12 @@ def test_ondemand_download_failure_to_replace(
env = neon_env_builder.init_start()
tenant_id, timeline_id = env.neon_cli.create_tenant()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
env.initial_tenant = tenant_id
pageserver_http = env.pageserver.http_client()
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
# remove layers so that they will be redownloaded
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
@@ -710,8 +700,10 @@ def test_ondemand_download_failure_to_replace(
# requesting details with non-incremental size should trigger a download of the only layer
# this will need to be adjusted if an index for logical sizes is ever implemented
with pytest.raises(PageserverApiException):
# error message is not useful
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
# PageserverApiException is expected because of the failpoint (computing the
# non-incremental size for timeline_detail triggers it)
# A ReadTimeout can happen on our busy CI, but it should not, because there is
# no more busy-looping; should it ever be added back, we would wait for 15s here.
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
actual_message = ".* ERROR .*layermap-replace-notfound"
assert env.pageserver.log_contains(actual_message) is not None
@@ -724,3 +716,7 @@ def test_ondemand_download_failure_to_replace(
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
# if the above returned, then we didn't have a livelock, and all is well
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
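# Example (illustrative): the tenant config plumbing expects string values, so
#     stringify({"gc_period": "0s", "checkpoint_distance": 128 * 1024})
# returns {"gc_period": "0s", "checkpoint_distance": "131072"}.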

View File

@@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
# Use a tiny checkpoint distance, to create a lot of layers quickly.
# That allows us to stress the compaction and layer flushing logic more.
tenant, _ = env.neon_cli.create_tenant(

View File

@@ -1,6 +1,6 @@
import json
import subprocess
from typing import Any, List, Optional
from typing import Any, List, Optional, Tuple
import psycopg2
import pytest
@@ -260,3 +260,73 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
assert rows == [["1", "a", "{1,2,3}"]]
def test_sql_over_http_batch(static_proxy: NeonProxy):
static_proxy.safe_psql("create role http with login password 'http' superuser")
def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
response = requests.post(
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Batch-Isolation-Level": "Serializable",
"Neon-Batch-Read-Only": "true" if read_only else "false",
},
verify=str(static_proxy.test_output_dir / "proxy.crt"),
)
assert response.status_code == 200
return response.json()["results"], response.headers
result, headers = qq(
[
("select 42 as answer", None),
("select $1 as answer", [42]),
("select $1 * 1 as answer", [42]),
("select $1::int[] as answer", [[1, 2, 3]]),
("select $1::json->'a' as answer", [{"a": {"b": 42}}]),
("select * from pg_class limit 1", None),
("create table t(id serial primary key, val int)", None),
("insert into t(val) values (10), (20), (30) returning id", None),
("select * from t", None),
("drop table t", None),
]
)
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
assert headers["Neon-Batch-Read-Only"] == "false"
assert result[0]["rows"] == [{"answer": 42}]
assert result[1]["rows"] == [{"answer": "42"}]
assert result[2]["rows"] == [{"answer": 42}]
assert result[3]["rows"] == [{"answer": [1, 2, 3]}]
assert result[4]["rows"] == [{"answer": {"b": 42}}]
assert len(result[5]["rows"]) == 1
res = result[6]
assert res["command"] == "CREATE"
assert res["rowCount"] is None
res = result[7]
assert res["command"] == "INSERT"
assert res["rowCount"] == 3
assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}]
res = result[8]
assert res["command"] == "SELECT"
assert res["rowCount"] == 3
res = result[9]
assert res["command"] == "DROP"
assert res["rowCount"] is None
assert len(result) == 10
result, headers = qq(
[
("select 42 as answer", None),
],
True,
)
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
assert headers["Neon-Batch-Read-Only"] == "true"
assert result[0]["rows"] == [{"answer": 42}]

View File

@@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.is_testing_enabled_or_skip()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future delta layer.*")
env.pageserver.allowed_errors.append(".*found future image layer.*")
# Create a branch for us
env.neon_cli.create_branch("test_pageserver_recovery", "main")

View File

@@ -378,12 +378,10 @@ def test_remote_timeline_client_calls_started_metric(
test_name="test_remote_timeline_client_metrics",
)
env = neon_env_builder.init_start()
# create tenant with config that deterministically allows
# compaction and gc
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
# Thinking about using a shared environment? The test assumes that global
# metrics are for a single tenant.
env = neon_env_builder.init_start(
initial_tenant_conf={
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
@@ -398,6 +396,10 @@ def test_remote_timeline_client_calls_started_metric(
}
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
client = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -419,6 +421,7 @@ def test_remote_timeline_client_calls_started_metric(
"VACUUM foo",
]
)
assert timeline_id is not None
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
calls_started: Dict[Tuple[str, str], List[int]] = {
@@ -428,13 +431,14 @@ def test_remote_timeline_client_calls_started_metric(
}
def fetch_calls_started():
assert timeline_id is not None
for (file_kind, op_kind), observations in calls_started.items():
val = client.get_remote_timeline_client_metric(
"pageserver_remote_timeline_client_calls_started_count",
tenant_id,
timeline_id,
file_kind,
op_kind,
val = client.get_metric_value(
name="pageserver_remote_timeline_client_calls_started_count",
filter={
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
)
assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
val = int(val)
@@ -518,12 +522,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
)
env = neon_env_builder.init_start()
# create tenant with config that deterministically allows
# compaction and gc
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
# small checkpointing and compaction targets to ensure we generate many operations
"checkpoint_distance": f"{64 * 1024}",
"compaction_threshold": "1",
@@ -535,6 +535,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
"pitr_interval": "0s",
}
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
timeline_path = env.timeline_dir(tenant_id, timeline_id)
client = env.pageserver.http_client()
@@ -787,12 +791,8 @@ def test_compaction_delete_before_upload(
test_name="test_compaction_delete_before_upload",
)
env = neon_env_builder.init_start()
# create tenant with config that deterministically allows
# compaction and disables gc
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
# Set a small compaction threshold
"compaction_threshold": "3",
# Disable GC
@@ -802,6 +802,10 @@ def test_compaction_delete_before_upload(
}
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
client = env.pageserver.http_client()
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:

View File

@@ -2,6 +2,7 @@ import asyncio
import random
import time
from threading import Thread
from typing import List, Optional
import asyncpg
import pytest
@@ -21,6 +22,7 @@ from fixtures.pageserver.utils import (
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
from prometheus_client.samples import Sample
def do_gc_target(
@@ -854,3 +856,89 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
assert (
query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
), "Should have timeline data back"
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_metrics_while_ignoring_broken_tenant_and_reloading(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
)
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
env.pageserver.allowed_errors.append(
r".* Changing Active tenant to Broken state, reason: broken from test"
)
def only_int(samples: List[Sample]) -> Optional[int]:
if len(samples) == 1:
return int(samples[0].value)
assert len(samples) == 0
return None
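# only_int distills a metric query result: exactly one sample yields its
# integer value, no samples yield None, and anything else fails the test.
# E.g. only_int([]) is None; a single Sample with value 1.0 yields 1.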
wait_until_tenant_state(client, env.initial_tenant, "Active", 10, 0.5)
client.tenant_break(env.initial_tenant)
found_broken = False
active, broken, broken_set = ([], [], [])
for _ in range(10):
m = client.get_metrics()
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
if found_broken:
break
log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
time.sleep(0.5)
assert (
found_broken
), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
client.tenant_ignore(env.initial_tenant)
found_broken = False
broken, broken_set = ([], [])
for _ in range(10):
m = client.get_metrics()
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
if found_broken:
break
time.sleep(0.5)
assert (
found_broken
), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
client.tenant_load(env.initial_tenant)
found_active = False
active, broken_set = ([], [])
for _ in range(10):
m = client.get_metrics()
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_active = only_int(active) == 1 and len(broken_set) == 0
if found_active:
break
time.sleep(0.5)
assert (
found_active
), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"

View File

@@ -213,6 +213,9 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
# Test (a subset of) pageserver global metrics
for metric in PAGESERVER_GLOBAL_METRICS:
if metric.startswith("pageserver_remote"):
continue
ps_samples = ps_metrics.query_all(metric, {})
assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
for sample in ps_samples:
@@ -380,10 +383,8 @@ def test_pageserver_with_empty_tenants(
ps_metrics = client.get_metrics()
broken_tenants_metric_filter = {
"tenant_id": str(tenant_without_timelines_dir),
"state": "Broken",
}
active_tenants_metric_filter = {
"tenant_id": str(tenant_with_empty_timelines),
"state": "Active",
}
@@ -399,7 +400,7 @@ def test_pageserver_with_empty_tenants(
tenant_broken_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
"pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
).value
)

View File

@@ -38,6 +38,12 @@ def test_threshold_based_eviction(
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(metrics_refused_log_line)
# these can happen whenever we run consumption metrics collection
env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
env.pageserver.allowed_errors.append(
r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
)
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
assert isinstance(timeline_id, TimelineId)

View File

@@ -1,3 +1,4 @@
import enum
import os
import queue
import shutil
@@ -11,9 +12,12 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
S3Storage,
available_remote_storages,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -117,59 +121,183 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
class Check(enum.Enum):
RETRY_WITHOUT_RESTART = enum.auto()
RETRY_WITH_RESTART = enum.auto()
DELETE_FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
"timeline-delete-before-rm",
"timeline-delete-during-rm",
"timeline-delete-after-rm",
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
"timeline-delete-after-rm-metadata",
"timeline-delete-after-rm-dir",
]
def combinations():
result = []
remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in DELETE_FAILPOINTS:
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
):
# the above failpoints are not relevant for a config without remote storage
continue
result.append((remote_storage_kind, delete_failpoint))
return result
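# Each generated combination is a (RemoteStorageKind, failpoint_name) pair,
# e.g. (RemoteStorageKind.MOCK_S3, "timeline-delete-before-rm"); REAL_S3 is
# added only when ENABLE_REAL_S3_REMOTE_STORAGE is set in the environment.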
# cover the two cases: remote storage configured vs not configured
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_delete_timeline_post_rm_failure(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
@pytest.mark.parametrize("check", list(Check))
def test_delete_timeline_exercise_crash_safety_failpoints(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
failpoint: str,
check: Check,
pg_bin: PgBin,
):
"""
If there is a failure after removing the timeline directory, the delete operation
should be retryable.
If there is a failure during deletion at one of the associated failpoints (or a crash-restart happens at that point), the delete operation
should be retryable and resume successfully.
We iterate over the failpoint list, advancing to the next failpoint each time.
1. Set settings to generate many layers.
2. Create a branch.
3. Insert something.
4. Run the test.
5. Iterate over the failpoints.
6. Execute delete for each failpoint.
7. Ensure the failpoint is hit.
8. Retry or restart without the failpoint and check the result.
"""
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_delete_timeline_post_rm_failure"
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
)
env = neon_env_builder.init_start()
assert env.initial_timeline
env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "0s",
"compaction_period": "0s",
"checkpoint_distance": f"{1024 ** 2}",
"image_creation_threshold": "100",
}
)
ps_http = env.pageserver.http_client()
failpoint_name = "timeline-delete-after-rm"
ps_http.configure_failpoints((failpoint_name, "return"))
timeline_id = env.neon_cli.create_timeline("delete")
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
if remote_storage_kind is RemoteStorageKind.NOOP:
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
else:
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=env.initial_timeline,
expected_state="Broken",
iterations=2, # effectively try immediately and retry once in one second
)
# FIXME: #4719
# timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
env.pageserver.allowed_errors.append(at_failpoint_log_message)
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# This appears when we stop the flush loop during deletion and the pageserver is then stopped
env.pageserver.allowed_errors.append(
f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
)
# retry without failpoint, it should succeed
ps_http.configure_failpoints((failpoint_name, "off"))
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline)
# This happens when we fail before scheduling the background operation.
# The timeline is left in the Stopping state and the retry tries to stop it again.
env.pageserver.allowed_errors.append(
f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
".*Ignoring new state, equal to the existing one: Stopping"
)
# This happens when we retry delete requests for broken timelines
env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
# This happens when timeline remnants are cleaned up during loading
env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
# In one of the branches we poll for the tenant to become active. Polling can generate this log message:
env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
ps_http.configure_failpoints((failpoint, "return"))
# These failpoints fire before the background task is spawned,
# so they result in an API request failure.
if failpoint in (
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
):
with pytest.raises(PageserverApiException, match=failpoint):
ps_http.timeline_delete(env.initial_tenant, timeline_id)
else:
ps_http.timeline_delete(env.initial_tenant, timeline_id)
timeline_info = wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=timeline_id,
expected_state="Broken",
iterations=2, # effectively try immediately and retry once in one second
)
reason = timeline_info["state"]["Broken"]["reason"]
log.info(f"timeline broken: {reason}")
# failpoint may not be the only error in the stack
assert reason.endswith(f"failpoint: {failpoint}"), reason
wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
if check is Check.RETRY_WITH_RESTART:
env.pageserver.stop()
env.pageserver.start()
if failpoint == "timeline-delete-before-index-deleted-at":
# We crashed before persisting this to remote storage, need to retry delete request
# Wait till the tenant is loaded. Shouldn't take longer than 2 seconds (we shouldn't block tenant loading)
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
else:
# Pageserver should've resumed deletion after restart.
wait_timeline_detail_404(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
ps_http.configure_failpoints((failpoint, "off"))
timeline_delete_wait_completed(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
)
# Check that remote storage is empty
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(env.initial_tenant),
"timelines",
str(timeline_id),
)
),
)
timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
# Check no delete mark present
assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -327,7 +455,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
wait_until_timeline_state(
timeline_info = wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=leaf_timeline_id,
@@ -335,8 +463,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
iterations=2, # effectively try immediately and retry once in one second
)
# FIXME: #4719
# timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"
assert leaf_timeline_path.exists(), "the failpoint didn't work"
@@ -588,6 +715,7 @@ def test_timeline_delete_works_for_remote_smoke(
assert tenant_id == env.initial_tenant
assert main_timeline_id == env.initial_timeline
assert env.initial_timeline is not None
timeline_ids = [env.initial_timeline]
for i in range(2):
branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")

View File

@@ -1,4 +1,4 @@
{
"postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e",
"postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591"
"postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7",
"postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f"
}